zf

zenflows testing
git clone https://s.sonu.ch/~srfsh/zf.git

erlang_lexer.ex (11087B)


defmodule Makeup.Lexers.ErlangLexer do
  @moduledoc """
  A `Makeup` lexer for the `Erlang` language.
  """

  @behaviour Makeup.Lexer

  import NimbleParsec
  import Makeup.Lexer.Combinators
  import Makeup.Lexer.Groups

  ###################################################################
  # Step #1: tokenize the input (into a list of tokens)
  ###################################################################

  whitespace = ascii_string([?\s, ?\f, ?\n], min: 1) |> token(:whitespace)

  # This is the combinator that ensures that the lexer will never reject a file
  # because of invalid input syntax.
  any_char = utf8_char([]) |> token(:error)

  comment =
    ascii_char([?%])
    |> optional(utf8_string([not: ?\n], min: 1))
    |> token(:comment_single)
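  # e.g. `% TODO` lexes as a single :comment_single token up to the end of the line.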

  hashbang =
    string("\n#!")
    |> utf8_string([not: ?\n], min: 1)
    |> string("\n")
    |> token(:comment_hashbang)

  escape_octal = ascii_string([?0..?7], min: 1, max: 3)

  escape_char = ascii_char([?\b, ?\d, ?\e, ?\f, ?\n, ?\r, ?\s, ?\t, ?\v, ?\', ?\", ?\\])

  escape_hex =
    choice([
      string("x") |> ascii_string([?0..?9, ?a..?f, ?A..?F], 2),
      string("x{") |> ascii_string([?0..?9, ?a..?f, ?A..?F], min: 1) |> string("}")
    ])
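  # Matches hex escape bodies, e.g. the `x2A` in `\x2A` or the `x{263A}` in `\x{263A}`.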

  escape_ctrl = string("^") |> ascii_char([?a..?z, ?A..?Z])

  escape =
    choice([
      escape_char,
      escape_octal,
      escape_hex,
      escape_ctrl
    ])

  numeric_base =
    choice([
      ascii_char([?1..?2]) |> ascii_char([?0..?9]),
      string("3") |> ascii_char([?0..?6]),
      ascii_char([?2..?9])
    ])
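  # Radix prefix of Erlang's base#value integers: 10..29 (two digits),
  # 30..36 ("3" followed by 0..6), or a single digit 2..9.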

  # Numbers
  digits = ascii_string([?0..?9], min: 1)

  number_integer =
    optional(ascii_char([?+, ?-]))
    |> concat(digits)
    |> token(:number_integer)

  number_integer_in_weird_base =
    optional(ascii_char([?+, ?-]))
    |> concat(numeric_base)
    |> string("#")
    |> ascii_string([?0..?9, ?a..?z, ?A..?Z], min: 1)
    |> token(:number_integer)
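  # e.g. 2#1010, 16#fb9e, 36#zz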

  # Floating point numbers
  float_scientific_notation_part =
    ascii_string([?e, ?E], 1)
    |> optional(string("-"))
    |> concat(digits)

  number_float =
    optional(ascii_char([?+, ?-]))
    |> concat(digits)
    |> string(".")
    |> concat(digits)
    |> optional(float_scientific_notation_part)
    |> token(:number_float)
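  # e.g. 3.14, -0.5, 2.0e-3 (digits are required on both sides of the dot)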

  variable_name =
    ascii_string([?A..?Z, ?_], 1)
    |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1))

  simple_atom_name =
    ascii_string([?a..?z], 1)
    |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1))
    |> reduce({Enum, :join, []})

  single_quote_escape = string("\\'")

  quoted_atom_name_middle =
    lookahead_not(string("'"))
    |> choice([
      single_quote_escape,
      utf8_string([not: ?\n, not: ?', not: ?\\], min: 1),
      escape
    ])

  quoted_atom_name =
    string("'")
    |> repeat(quoted_atom_name_middle)
    |> concat(string("'"))
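  # e.g. 'hello world' or 'don\'t' (with an escaped quote)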

  atom_name =
    choice([
      simple_atom_name,
      quoted_atom_name
    ])

  atom = token(atom_name, :string_symbol)

  namespace =
    token(atom_name, :name_class)
    |> concat(token(":", :punctuation))

  function =
    atom_name
    |> lexeme()
    |> token(:name_function)
    |> concat(optional(whitespace))
    |> concat(token("(", :punctuation))
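  # An atom directly followed by "(" is a function call, e.g. the `foo` in `foo(X)`.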

  # Can also be a function name
  variable =
    variable_name
    # `lexeme` collapses the parse result so the token value is a single string
    |> lexeme()
    |> token(:name)

  macro_name = choice([variable_name, atom_name])

  macro =
    string("?")
    |> concat(macro_name)
    |> token(:name_constant)
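  # e.g. ?MODULE, ?LINE, ?my_macro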

  label =
    string("#")
    |> concat(atom_name)
    |> optional(string(".") |> concat(atom_name))
    |> token(:name_label)

  character =
    string("$")
    |> choice([
      escape,
      string("\\") |> ascii_char([?\s, ?%]),
      ascii_char(not: ?\\)
    ])
    |> token(:string_char)
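  # Character literals, e.g. $a, $9, $\%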

  string_interpol =
    string("~")
    |> optional(ascii_string([?0..?9, ?., ?*], min: 1))
    |> ascii_char(to_charlist("~#+BPWXb-ginpswx"))
    |> token(:string_interpol)
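  # io:format-style control sequences inside strings, e.g. ~p, ~s, ~10B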

  escape_double_quote = string(~s/\\"/)

  erlang_string = string_like(~s/"/, ~s/"/, [escape_double_quote, string_interpol], :string)

  punctuation =
    word_from_list([","] ++ ~w[\[ \] : _ @ \" . \#{ { } ( ) | ; => := << >> || -> \#], :punctuation)

  # Combinator that highlights expressions surrounded by a pair of delimiters.
  tuple = many_surrounded_by(parsec(:root_element), "{", "}")

  syntax_operators =
    word_from_list(~W[+ - +? ++ = == -- * / < > /= =:= =/= =< >= ==? <- ! ? ?!], :operator)

  record =
    token(string("#"), :operator)
    |> concat(atom)
    |> choice([
      token("{", :punctuation),
      token(".", :punctuation)
    ])
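  # Record syntax, e.g. #person{name = "Joe"} or Rec#person.name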

  # We need to match on the newline here so as not to tokenize a function call
  # as a module attribute. Without the newline matching, the expression
  # `a(X) - b(Y)` would tokenize `b(Y)` as a module attribute definition
  # instead of a function call.
  module_attribute =
    token("\n", :whitespace)
    |> optional(whitespace)
    |> concat(token("-", :punctuation))
    |> optional(whitespace)
    |> concat(atom_name |> token(:name_attribute))
    |> optional(whitespace)
    |> optional(token("(", :punctuation))
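  # e.g. -module(foo). or -export([bar/1]).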

  function_arity =
    atom
    |> concat(token("/", :punctuation))
    |> concat(number_integer)
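  # name/arity references, e.g. the `bar/1` in -export([bar/1]).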

  # Tag the tokens with the language name.
  # This makes it easier to postprocess files with multiple languages.
  @doc false
  def __as_erlang_language__({ttype, meta, value}) do
    {ttype, Map.put(meta, :language, :erlang), value}
  end

  root_element_combinator =
    choice([
      module_attribute,
      hashbang,
      whitespace,
      comment,
      erlang_string,
      record,
      punctuation,
      # `tuple` might be unnecessary
      tuple,
      syntax_operators,
      # Numbers
      number_integer_in_weird_base,
      number_float,
      number_integer,
      # Variables
      variable,
      namespace,
      function_arity,
      function,
      atom,
      macro,
      character,
      label,
      # If we can't parse any of the above, we highlight the next character
      # as an error and proceed from there.
      # A lexer should always consume any string given as input.
      any_char
    ])
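
  # Note: the order of alternatives in `root_element_combinator` matters.
  # For example, `number_integer_in_weird_base` must come before
  # `number_integer`, or the base prefix of `16#ff` would be consumed as a
  # plain integer.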

  ##############################################################################
  # Semi-public API: these two functions can be used by someone who wants to
  # embed this lexer into another lexer, but other than that, they are not
  # meant to be used by end-users.
  ##############################################################################

  # Whether to inline the generated parser functions (a compile-time setting).
  @inline Application.get_env(:makeup_erlang, :inline, false)

  @impl Makeup.Lexer
  defparsec(
    :root_element,
    root_element_combinator |> map({__MODULE__, :__as_erlang_language__, []}),
    inline: @inline
  )

  @impl Makeup.Lexer
  defparsec(
    :root,
    repeat(parsec(:root_element)),
    inline: @inline
  )

  ###################################################################
  # Step #2: postprocess the list of tokens
  ###################################################################

  @keywords ~W[after begin case catch cond end fun if let of query receive try when]

  @builtins ~W[
    abs append_element apply atom_to_list binary_to_list bitstring_to_list
    binary_to_term bit_size bump_reductions byte_size cancel_timer
    check_process_code delete_module demonitor disconnect_node display
    element erase exit float float_to_list fun_info fun_to_list
    function_exported garbage_collect get get_keys group_leader hash
    hd integer_to_list iolist_to_binary iolist_size is_atom is_binary
    is_bitstring is_boolean is_builtin is_float is_function is_integer
    is_list is_number is_pid is_port is_process_alive is_record is_reference
    is_tuple length link list_to_atom list_to_binary list_to_bitstring
    list_to_existing_atom list_to_float list_to_integer list_to_pid
    list_to_tuple load_module localtime_to_universaltime make_tuple
    md5 md5_final md5_update memory module_loaded monitor monitor_node
    node nodes open_port phash phash2 pid_to_list port_close port_command
    port_connect port_control port_call port_info port_to_list
    process_display process_flag process_info purge_module put read_timer
    ref_to_list register resume_process round send send_after send_nosuspend
    set_cookie setelement size spawn spawn_link spawn_monitor spawn_opt
    split_binary start_timer statistics suspend_process system_flag
    system_info system_monitor system_profile term_to_binary tl trace
    trace_delivered trace_info trace_pattern trunc tuple_size tuple_to_list
    universaltime_to_localtime unlink unregister whereis
  ]

  @word_operators ~W[and andalso band bnot bor bsl bsr bxor div not or orelse rem xor]

  # Reclassify atom tokens that are actually keywords, builtins, or word operators.
  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @keywords,
    do: [{:keyword, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @builtins,
    do: [{:name_builtin, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @word_operators,
    do: [{:operator_word, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([token | tokens]), do: [token | postprocess_helper(tokens)]

  defp postprocess_helper([]), do: []

  @impl Makeup.Lexer
  def postprocess(tokens, _opts \\ []), do: postprocess_helper(tokens)
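
  # For illustration, postprocessing turns {:string_symbol, meta, "case"} into
  # {:keyword, meta, "case"}, and {:string_symbol, meta, "band"} into
  # {:operator_word, meta, "band"}.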

  #######################################################################
  # Step #3: highlight matching delimiters
  # By default, this includes delimiters that are used in many languages,
  # but feel free to delete these or add more.
  #######################################################################

  @impl Makeup.Lexer
  defgroupmatcher(:match_groups,
    parentheses: [
      open: [[{:punctuation, %{language: :erlang}, "("}]],
      close: [[{:punctuation, %{language: :erlang}, ")"}]]
    ],
    list: [
      open: [
        [{:punctuation, %{language: :erlang}, "["}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "]"}]
      ]
    ],
    tuple: [
      open: [
        [{:punctuation, %{language: :erlang}, "{"}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "}"}]
      ]
    ],
    map: [
      open: [
        [{:punctuation, %{language: :erlang}, "\#{"}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "}"}]
      ]
    ]
  )

  # `lex/2` prepends a newline to the input so `module_attribute` can anchor on
  # it; this strips that newline back out of the first token.
  defp remove_initial_newline([{ttype, meta, text} | tokens]) do
    case to_string(text) do
      "\n" -> tokens
      "\n" <> rest -> [{ttype, meta, rest} | tokens]
    end
  end

  # Finally, the public API for the lexer
  @impl Makeup.Lexer
  def lex(text, opts \\ []) do
    group_prefix = Keyword.get(opts, :group_prefix, random_prefix(10))
    {:ok, tokens, "", _, _, _} = root("\n" <> text)

    tokens
    |> remove_initial_newline()
    |> postprocess()
    |> match_groups(group_prefix)
  end
end
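
# Usage sketch (illustrative; exact token metadata depends on the Makeup
# version and the group matcher):
#
#     Makeup.Lexers.ErlangLexer.lex("foo(X) -> X + 1.")
#     #=> [{:name_function, %{language: :erlang}, "foo"},
#     #    {:punctuation, %{language: :erlang, ...}, "("}, ...]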