zf

zenflows testing
git clone https://s.sonu.ch/~srfsh/zf.git

elixir_lexer.ex (18779B)


defmodule Makeup.Lexers.ElixirLexer do
  @moduledoc """
  A `Makeup` lexer for the Elixir language.
  """

  import NimbleParsec
  import Makeup.Lexer.Combinators
  import Makeup.Lexer.Groups
  import Makeup.Lexers.ElixirLexer.Helper

  @behaviour Makeup.Lexer

  ###################################################################
  # Step #1: tokenize the input (into a list of tokens)
  ###################################################################
  # We will often compose combinators into larger combinators.
  # Sometimes, the smaller combinator is useful on its own as a token, and sometimes it isn't.
  # We'll adopt the following "convention":
  #
  # 1. A combinator that ends with `_name` returns a string
  # 2. Other combinators will *usually* return a token
  #
  # Why this convention? Tokens can't be composed further, while raw strings can.
  # This way, we immediately know which of the combinators we can compose.
  # TODO: check we're following this convention
  # NOTE: if Elixir had a good static type system it would help us do the right thing here.
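  # For example, `operator_name` below returns a raw string, so it can be reused
  # by `operator`, `normal_atom` and `normal_keyword`; `operator` wraps it in a
  # token and can't be composed any further.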

  whitespace = ascii_string([?\r, ?\s, ?\n, ?\f], min: 1) |> token(:whitespace)

  newlines =
    choice([string("\r\n"), string("\n")])
    |> optional(ascii_string([?\s, ?\n, ?\f, ?\r], min: 1))
    |> token(:whitespace)

  any_char = utf8_char([]) |> token(:error)

  # Numbers
  digits = ascii_string([?0..?9], min: 1)
  bin_digits = ascii_string([?0..?1], min: 1)
  hex_digits = ascii_string([?0..?9, ?a..?f, ?A..?F], min: 1)
  oct_digits = ascii_string([?0..?7], min: 1)
  # Digits in an integer may be separated by underscores
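  # (e.g. `1_000_000` or `0xAB_CD`)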
  number_bin_part = with_optional_separator(bin_digits, "_")
  number_oct_part = with_optional_separator(oct_digits, "_")
  number_hex_part = with_optional_separator(hex_digits, "_")
  integer = with_optional_separator(digits, "_")

  # Tokens for the lexer
  number_bin = string("0b") |> concat(number_bin_part) |> token(:number_bin)
  number_oct = string("0o") |> concat(number_oct_part) |> token(:number_oct)
  number_hex = string("0x") |> concat(number_hex_part) |> token(:number_hex)
  # Base 10
  number_integer = token(integer, :number_integer)

  # Floating point numbers
  float_scientific_notation_part =
    ascii_string([?e, ?E], 1)
    |> optional(string("-"))
    |> concat(integer)

  number_float =
    integer
    |> string(".")
    |> concat(integer)
    |> optional(float_scientific_notation_part)
    |> token(:number_float)

  variable_name =
    parsec({Makeup.Lexers.ElixirLexer.Variables, :variable_start_chars})
    |> repeat(parsec({Makeup.Lexers.ElixirLexer.Variables, :variable_continue_chars}))
    |> optional(utf8_char([??, ?!]))

  variable =
    variable_name
    |> lexeme
    |> token(:name)

  # TODO: as above
  alias_part =
    ascii_string([?A..?Z], 1)
    |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1))

  module_name = alias_part |> concat(repeat(string(".") |> concat(alias_part)))

  module = token(module_name, :name_class)

  operator_name = word_from_list(~W(
      <<< >>> ||| &&& ^^^ ~~~ === !== ~>> <~> |~> <|>
      == != <= >= && || \\ <> ++ -- |> =~ -> <- ~> <~ :: ..
      = < > + - * / | . ^ & ! //
    ))

  operator = token(operator_name, :operator)

  # The lexer needs to test these before `>>` and `<<`
  bitshifts =
    word_from_list(~W(<<< >>>))
    |> token(:operator)

  special_atom_name = word_from_list(~W(... <<>> %{} % {} ..//))

  triple_dot = token("...", :name)

  map_arrow = token("=>", :punctuation)

  anon_function_arguments =
    string("&")
    |> concat(digits)
    |> token(:name_entity)

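  # Char literals: `?a` is matched by `normal_char`, `?\n` by `escape_char`.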
  normal_char =
    string("?")
    |> utf8_string([], 1)
    |> token(:string_char)

  escape_char =
    string("?\\")
    |> utf8_string([], 1)
    |> token(:string_char)

  special_atom =
    string(":")
    |> concat(special_atom_name)
    |> token(:string_symbol)

  attribute =
    string("@")
    |> concat(variable_name)
    |> token(:name_attribute)

  punctuation =
    word_from_list(
      [":", ";", ",", ".", "%"],
      :punctuation
    )

  # Combinators that highlight Elixir expressions surrounded by a pair of delimiters.
  # Most of the time, the delimiters can be described by simple characters, but the
  # combinator that parses a struct is more complex.
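  # For example, `interpolation` matches `#{` ... `}` inside interpolated strings.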
  interpolation = many_surrounded_by(parsec(:root_element), "\#{", "}", :string_interpol)
  tuple = many_surrounded_by(parsec(:root_element), "{", "}")

  binary_inside_opaque_struct = many_surrounded_by(parsec(:root_element), "<<", ">>")
  # Only for the IEx lexer (it's not valid Elixir code):
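  # Example: `#Reference<0.1.2.3>` as printed by `inspect/1`.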
  opaque_struct =
    many_surrounded_by(
      choice([
        binary_inside_opaque_struct,
        parsec(:root_element)
      ]),
      token("#", :punctuation) |> concat(module) |> concat(token("<", :punctuation)),
      token(">", :punctuation)
    )

  delimiters_punctuation =
    word_from_list(
      ~W( ( \) [ ] << >>),
      :punctuation
    )

  map = many_surrounded_by(parsec(:root_element), "%{", "}")

  delimiter_pairs = [
    delimiters_punctuation,
    tuple,
    map
  ]

  normal_atom_name =
    parsec({Makeup.Lexers.ElixirLexer.Atoms, :atom_start_chars})
    |> repeat(parsec({Makeup.Lexers.ElixirLexer.Atoms, :atom_continue_chars}))
    |> optional(utf8_char([??, ?!]))

  normal_atom =
    string(":")
    |> choice([operator_name, normal_atom_name])
    |> token(:string_symbol)

  unicode_char_in_string =
    string("\\u")
    |> ascii_string([?0..?9, ?a..?f, ?A..?F], 4)
    |> token(:string_escape)

  escaped_char =
    string("\\")
    |> utf8_string([], 1)
    |> token(:string_escape)

  # We must support iex prompts inside a string, sigil or heredoc.
  # For example:
  #
  #   iex(1)> a = """
  #   ...(1)> line1
  #   ...(1)> line2
  #   ...(1)> """
  #
  # Inside the string we don't expect the `iex>` prompt, only the `...>` prompt.
  iex_prompt_inside_string =
    string("\n...")
    |> optional(string("(") |> concat(digits) |> string(")"))
    |> string(">")
    |> optional(string(" "))
    |> token(:generic_prompt, %{selectable: false})

  combinators_inside_string = [
    unicode_char_in_string,
    escaped_char,
    interpolation,
    iex_prompt_inside_string
  ]

  string_atom =
    choice([
      string_like(":\"", "\"", combinators_inside_string, :string_symbol),
      string_like(":'", "'", combinators_inside_string, :string_symbol)
    ])

  atom =
    choice([
      special_atom,
      normal_atom,
      string_atom
    ])

  string_keyword =
    choice([
      string_like("\"", "\"", combinators_inside_string, :string_symbol),
      string_like("'", "'", combinators_inside_string, :string_symbol)
    ])
    |> concat(token(string(":"), :punctuation))

  normal_keyword =
    choice([operator_name, normal_atom_name])
    |> token(:string_symbol)
    |> concat(token(string(":"), :punctuation))

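  # Keyword-list sugar: in `[foo: 1]`, `foo:` lexes as a :string_symbol followed
  # by a :punctuation token (note the whitespace lookahead below).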
  keyword =
    choice([
      normal_keyword,
      string_keyword
    ])
    |> lookahead(whitespace)

  sigil_delimiters = [
    {~S["""], ~S["""]},
    {"'''", "'''"},
    {"\"", "\""},
    {"'", "'"},
    {"/", "/"},
    {"{", "}"},
    {"[", "]"},
    {"(", ")"},
    {"<", ">"},
    {"|", "|"}
  ]
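  # Lowercase sigils (`~s`, `~r`, ...) support interpolation and escapes;
  # uppercase sigils (`~S`, `~R`, ...) treat their content literally.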

  sigils_interpol =
    for {ldelim, rdelim} <- sigil_delimiters do
      sigil(ldelim, rdelim, [?a..?z], combinators_inside_string)
    end

  sigils_no_interpol =
    for {ldelim, rdelim} <- sigil_delimiters do
      sigil(ldelim, rdelim, [?A..?Z], [escape_delim(rdelim), iex_prompt_inside_string])
    end

  all_sigils = sigils_interpol ++ sigils_no_interpol

  double_quoted_string_interpol = string_like("\"", "\"", combinators_inside_string, :string)
  single_quoted_string_interpol = string_like("'", "'", combinators_inside_string, :string_char)
  double_quoted_heredocs = string_like(~S["""], ~S["""], combinators_inside_string, :string)
  single_quoted_heredocs = string_like("'''", "'''", combinators_inside_string, :string_char)

  # `#PID<123.456.789>`
  pid =
    token("#", :punctuation)
    |> concat(token("PID", :name_class))
    |> concat(token("<", :punctuation))
    |> concat(number_integer)
    |> concat(token(".", :operator))
    |> concat(number_integer)
    |> concat(token(".", :operator))
    |> concat(number_integer)
    |> concat(token(">", :punctuation))

  line = repeat(lookahead_not(ascii_char([?\n])) |> utf8_string([], 1))

  inline_comment =
    string("#")
    |> concat(line)
    |> token(:comment_single)

  # An IEx prompt is supported in the normal Elixir lexer because false positives
  # would be extremely rare
  iex_prompt =
    choice([string("iex"), string("...")])
    |> optional(string("(") |> concat(digits) |> string(")"))
    |> string(">")
    |> optional(string(" "))
    |> token(:generic_prompt, %{selectable: false})

  stacktrace =
    string("** (")
    # The rest of the line is part of the traceback
    |> concat(line)
    # All lines indented by 4 spaces are part of the traceback
    |> repeat(string("\n    ") |> concat(line))
    |> token(:generic_traceback)

  root_element_combinator =
    choice(
      [
        # START of IEx-specific tokens
        # IEx prompt must come before names
        newlines |> choice([iex_prompt, stacktrace]),
        # a PID is a special kind of opaque struct
        pid,
        # Opaque struct (must come before inline comments)
        opaque_struct,
        # END of IEx-specific tokens
        whitespace,
        # Comments
        inline_comment,
        # Syntax sugar for keyword lists (must come before variables and strings)
        keyword,
        # Strings and sigils
        double_quoted_heredocs,
        single_quoted_heredocs,
        double_quoted_string_interpol,
        single_quoted_string_interpol
      ] ++
        all_sigils ++
        [
          # Chars
          escape_char,
          normal_char,
          # Atoms
          atom,
          # Module attributes
          attribute,
          # Anonymous function arguments (must come before the operators)
          anon_function_arguments,
          # Bitwise operators must match first
          bitshifts
          # Matching delimiters
        ] ++
        delimiter_pairs ++
        [
          # Triple dot (must come before operators)
          triple_dot,
          # Map arrow (must come before operators)
          map_arrow,
          # Operators
          operator,
          # Numbers
          number_bin,
          number_oct,
          number_hex,
          # Floats must come before integers
          number_float,
          number_integer,
          # Names
          variable,
          # Module names
          module,
          punctuation,
          # If we can't parse any of the above, we highlight the next character as an error
          # and proceed from there.
          # A lexer should always consume any string given as input.
          any_char
        ]
    )

  # By default, don't inline the lexers.
  # Inlining them increases performance by ~20%
  # at the cost of doubling the compilation times...
  @inline false

  @doc false
  def __as_elixir_language__({ttype, meta, value}) do
    {ttype, Map.put(meta, :language, :elixir), value}
  end

  # Semi-public API: these two functions can be used by someone who wants to
  # embed an Elixir lexer into another lexer, but other than that, they are not
  # meant to be used by end-users.

  # @impl Makeup.Lexer
  defparsec(
    :root_element,
    root_element_combinator |> map({__MODULE__, :__as_elixir_language__, []}),
    inline: @inline,
    export_combinator: true
  )

  # @impl Makeup.Lexer
  defparsec(
    :root,
    repeat(parsec(:root_element)),
    inline: @inline,
    export_combinator: true
  )

  ###################################################################
  # Step #2: postprocess the list of tokens
  ###################################################################

  @def_like ~W[def defp defmacro defmacrop defguard defguardp defn defnp]
  @keyword_declaration @def_like ++ ~W[
    defmodule defprotocol defdelegate defexception defstruct defimpl]
  @keyword ~W[
    fn do end after else rescue catch with
    case cond for if unless try receive raise
    quote unquote unquote_splicing throw super]
  @operator_word ~W[not and or when in]
  @keyword_namespace ~W[import require use alias]
  @name_constant ~W[nil true false]
  @name_builtin_pseudo ~W[_ __MODULE__ __DIR__ __ENV__ __CALLER__]

  # The `postprocess/1` function will require a major redesign when we decide to support
  # custom `def`-like keywords supplied by the user.
  defp postprocess_helper([]), do: []

  # In an expression such as:
  #
  #    def a + b, do: nil
  #
  # the variable_name `a` is a parameter for the `+/2` operator.
  # It should not be highlighted as a function name.
  # For that, we must scan a little further (one additional token) to find the operator.
  defp postprocess_helper([
         {:name, attrs1, text1},
         {:whitespace, _, _} = ws1,
         {:name, _, text2} = param,
         {:whitespace, _, _} = ws2,
         {:operator, _, _} = op
         | tokens
       ])
       when text1 in @def_like and text2 != "unquote" do
    [{:keyword_declaration, attrs1, text1}, ws1, param, ws2, op | postprocess_helper(tokens)]
  end

  # The same as above without whitespace
  defp postprocess_helper([
         {:name, attrs1, text1},
         {:whitespace, _, _} = ws,
         {:name, _, text2} = param,
         {:operator, _, _} = op
         | tokens
       ])
       when text1 in @def_like and text2 != "unquote" do
    [{:keyword_declaration, attrs1, text1}, ws, param, op | postprocess_helper(tokens)]
  end

  # If we're matching this branch, we already know that this is not an operator definition.
  # We can highlight the name after the `def`-like keyword as a function name.
  defp postprocess_helper([
         {:name, attrs1, text1},
         {:whitespace, _, _} = ws,
         {:name, attrs2, text2} | tokens
       ])
       when text1 in @def_like and text2 != "unquote" do
    [
      {:keyword_declaration, attrs1, text1},
      ws,
      {:name_function, attrs2, text2} | postprocess_helper(tokens)
    ]
  end

  # When calling functions from an Erlang module, highlight the atom as a module.
  #
  #     :crypto.strong_rand_bytes(4)
  defp postprocess_helper([
         {:string_symbol, attrs1, [":" | _] = module},
         {:operator, _, "."} = op,
         {:name, _, _} = text
         | tokens
       ]) do
    [{:name_class, attrs1, module}, op, text | postprocess_helper(tokens)]
  end

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword,
    do: [{:keyword, attrs, text} | postprocess_helper(tokens)]

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword_declaration,
    do: [{:keyword_declaration, attrs, text} | postprocess_helper(tokens)]

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @operator_word,
    do: [{:operator_word, attrs, text} | postprocess_helper(tokens)]

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword_namespace,
    do: [{:keyword_namespace, attrs, text} | postprocess_helper(tokens)]

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @name_constant,
    do: [{:name_constant, attrs, text} | postprocess_helper(tokens)]

  defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @name_builtin_pseudo,
    do: [{:name_builtin_pseudo, attrs, text} | postprocess_helper(tokens)]

  # Unused variables
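  # (i.e. names starting with `_`, such as `_acc`) get the :comment token type,
  # so most styles render them dimmed.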
  defp postprocess_helper([{:name, attrs, "_" <> _name = text} | tokens]),
    do: [{:comment, attrs, text} | postprocess_helper(tokens)]

  # Otherwise, don't do anything with the current token and go to the next token.
  defp postprocess_helper([token | tokens]), do: [token | postprocess_helper(tokens)]

  # Public API
  @impl Makeup.Lexer
  def postprocess(tokens, _opts \\ []), do: postprocess_helper(tokens)

  ###################################################################
  # Step #3: highlight matching delimiters
  ###################################################################

  @impl Makeup.Lexer
  defgroupmatcher(:match_groups,
    do_end: [
      open: [
        [{:keyword, %{language: :elixir}, "do"}]
      ],
      middle: [
        [{:keyword, %{language: :elixir}, "else"}],
        [{:keyword, %{language: :elixir}, "catch"}],
        [{:keyword, %{language: :elixir}, "rescue"}],
        [{:keyword, %{language: :elixir}, "after"}]
      ],
      close: [
        [{:keyword, %{language: :elixir}, "end"}]
      ]
    ],
    fn_end: [
      open: [[{:keyword, %{language: :elixir}, "fn"}]],
      close: [[{:keyword, %{language: :elixir}, "end"}]]
    ],
    parentheses: [
      open: [[{:punctuation, %{language: :elixir}, "("}]],
      close: [[{:punctuation, %{language: :elixir}, ")"}]]
    ],
    list: [
      open: [
        [{:punctuation, %{language: :elixir}, "["}]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, "]"}]
      ]
    ],
    tuple: [
      open: [
        [{:punctuation, %{language: :elixir}, "{"}]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, "}"}]
      ]
    ],
    map: [
      open: [
        [{:punctuation, %{language: :elixir}, "%{"}]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, "}"}]
      ]
    ],
    struct: [
      open: [
        [
          {:punctuation, %{language: :elixir}, "%"},
          {:name_class, %{language: :elixir}, _},
          {:punctuation, %{language: :elixir}, "{"}
        ]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, "}"}]
      ]
    ],
    opaque_struct: [
      open: [
        [
          {:punctuation, %{language: :elixir}, "#"},
          {:name_class, %{language: :elixir}, _},
          {:punctuation, %{language: :elixir}, "<"}
        ]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, ">"}]
      ]
    ],
    binaries: [
      open: [
        [{:punctuation, %{language: :elixir}, "<<"}]
      ],
      close: [
        [{:punctuation, %{language: :elixir}, ">>"}]
      ]
    ],
    interpolation: [
      open: [
        [{:string_interpol, %{language: :elixir}, "\#{"}]
      ],
      close: [
        [{:string_interpol, %{language: :elixir}, "}"}]
      ]
    ]
  )

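  # `lex/2` (below) prepends a newline so that combinators anchored on a newline,
  # such as the IEx prompt, can match at the start of the input; this helper
  # removes that artificial newline from the first token.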
  defp remove_initial_newline([{ttype, meta, text} | tokens]) do
    case to_string(text) do
      "\n" -> tokens
      "\n" <> rest -> [{ttype, meta, rest} | tokens]
    end
  end

  # Finally, the public API for the lexer
  @impl Makeup.Lexer
  def lex(text, opts \\ []) do
    group_prefix = Keyword.get(opts, :group_prefix, random_prefix(10))
    {:ok, tokens, "", _, _, _} = root("\n" <> text)

    tokens
    |> remove_initial_newline()
    |> postprocess([])
    |> match_groups(group_prefix)
  end
end
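
A minimal usage sketch (assuming the makeup and makeup_elixir packages are
installed; the exact token metadata shown is illustrative):

    tokens = Makeup.Lexers.ElixirLexer.lex("x = 1")
    # => [
    #      {:name, %{language: :elixir, ...}, "x"},
    #      {:whitespace, %{language: :elixir, ...}, " "},
    #      {:operator, %{language: :elixir, ...}, "="},
    #      {:whitespace, %{language: :elixir, ...}, " "},
    #      {:number_integer, %{language: :elixir, ...}, "1"}
    #    ]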