erlang_lexer.ex (11087B)
defmodule Makeup.Lexers.ErlangLexer do
  @moduledoc """
  A `Makeup` lexer for the `Erlang` language.
  """

  @behaviour Makeup.Lexer

  import NimbleParsec
  import Makeup.Lexer.Combinators
  import Makeup.Lexer.Groups

  ###################################################################
  # Step #1: tokenize the input (into a list of tokens)
  ###################################################################

  whitespace = ascii_string([?\s, ?\f, ?\n], min: 1) |> token(:whitespace)

  # This is the combinator that ensures that the lexer will never reject a file
  # because of invalid input syntax
  any_char = utf8_char([]) |> token(:error)

  comment =
    ascii_char([?%])
    |> optional(utf8_string([not: ?\n], min: 1))
    |> token(:comment_single)

  hashbang =
    string("\n#!")
    |> utf8_string([not: ?\n], min: 1)
    |> string("\n")
    |> token(:comment_hashbang)

  escape_octal = ascii_string([?0..?7], min: 1, max: 3)

  escape_char = ascii_char([?\b, ?\d, ?\e, ?\f, ?\n, ?\r, ?\s, ?\t, ?\v, ?\', ?\", ?\\])

  escape_hex =
    choice([
      string("x") |> ascii_string([?0..?9, ?a..?f, ?A..?F], 2),
      string("x{") |> ascii_string([?0..?9, ?a..?f, ?A..?F], min: 1) |> string("}")
    ])

  escape_ctrl = string("^") |> ascii_char([?a..?z, ?A..?Z])

  escape =
    choice([
      escape_char,
      escape_octal,
      escape_hex,
      escape_ctrl
    ])

  numeric_base =
    choice([
      ascii_char([?1..?2]) |> ascii_char([?0..?9]),
      string("3") |> ascii_char([?0..?6]),
      ascii_char([?2..?9])
    ])

  # Numbers
  digits = ascii_string([?0..?9], min: 1)

  number_integer =
    optional(ascii_char([?+, ?-]))
    |> concat(digits)
    |> token(:number_integer)
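  # Integers written in an explicit radix, e.g. `2#1010` or `16#3f`
  # (`numeric_base` above accepts any base from 2 to 36).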
  number_integer_in_weird_base =
    optional(ascii_char([?+, ?-]))
    |> concat(numeric_base)
    |> string("#")
    |> ascii_string([?0..?9, ?a..?z, ?A..?Z], min: 1)
    |> token(:number_integer)

  # Floating point numbers
  float_scientific_notation_part =
    ascii_string([?e, ?E], 1)
    |> optional(string("-"))
    |> concat(digits)

  number_float =
    optional(ascii_char([?+, ?-]))
    |> concat(digits)
    |> string(".")
    |> concat(digits)
    |> optional(float_scientific_notation_part)
    |> token(:number_float)

  variable_name =
    ascii_string([?A..?Z, ?_], 1)
    |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1))

  simple_atom_name =
    ascii_string([?a..?z], 1)
    |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1))
    |> reduce({Enum, :join, []})

  single_quote_escape = string("\\'")

  quoted_atom_name_middle =
    lookahead_not(string("'"))
    |> choice([
      single_quote_escape,
      utf8_string([not: ?\n, not: ?', not: ?\\], min: 1),
      escape
    ])

  quoted_atom_name =
    string("'")
    |> repeat(quoted_atom_name_middle)
    |> concat(string("'"))

  atom_name =
    choice([
      simple_atom_name,
      quoted_atom_name
    ])

  atom = token(atom_name, :string_symbol)

  namespace =
    token(atom_name, :name_class)
    |> concat(token(":", :punctuation))

  function =
    atom_name
    |> lexeme()
    |> token(:name_function)
    |> concat(optional(whitespace))
    |> concat(token("(", :punctuation))

  # Can also be a function name
  variable =
    variable_name
    # Check if you need to use the lexeme parser
    # (i.e. if you need the token value to be a string).
    # If not, just delete the lexeme parser.
    |> lexeme()
    |> token(:name)

  macro_name = choice([variable_name, atom_name])

  macro =
    string("?")
    |> concat(macro_name)
    |> token(:name_constant)

  label =
    string("#")
    |> concat(atom_name)
    |> optional(string(".") |> concat(atom_name))
    |> token(:name_label)

  character =
    string("$")
    |> choice([
      escape,
      string("\\") |> ascii_char([?\s, ?%]),
      ascii_char(not: ?\\)
    ])
    |> token(:string_char)

  string_interpol =
    string("~")
    |> optional(ascii_string([?0..?9, ?., ?*], min: 1))
    |> ascii_char(to_charlist("~#+BPWXb-ginpswx"))
    |> token(:string_interpol)

  escape_double_quote = string(~s/\\"/)
  erlang_string = string_like(~s/"/, ~s/"/, [escape_double_quote, string_interpol], :string)

  # Combinators that highlight expressions surrounded by a pair of delimiters.
  punctuation =
    word_from_list([","] ++ ~w[\[ \] : _ @ \" . \#{ { } ( ) | ; => := << >> || -> \#], :punctuation)

  tuple = many_surrounded_by(parsec(:root_element), "{", "}")

  syntax_operators =
    word_from_list(~W[+ - +? ++ = == -- * / < > /= =:= =/= =< >= ==? <- ! ? ?!], :operator)

  record =
    token(string("#"), :operator)
    |> concat(atom)
    |> choice([
      token("{", :punctuation),
      token(".", :punctuation)
    ])

  # We need to match on the newline here so that a function call is not tokenized
  # as a module attribute. Without the newline matching, the expression `a(X) - b(Y)`
  # would tokenize `b(Y)` as a module attribute definition instead of a function call.
  module_attribute =
    token("\n", :whitespace)
    |> optional(whitespace)
    |> concat(token("-", :punctuation))
    |> optional(whitespace)
    |> concat(atom_name |> token(:name_attribute))
    |> optional(whitespace)
    |> optional(token("(", :punctuation))

  function_arity =
    atom
    |> concat(token("/", :punctuation))
    |> concat(number_integer)

  # Tag the tokens with the language name.
  # This makes it easier to postprocess files with multiple languages.
  @doc false
  def __as_erlang_language__({ttype, meta, value}) do
    {ttype, Map.put(meta, :language, :erlang), value}
  end
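  # For example, a token such as {:number_integer, %{}, "42"} leaves this step
  # as {:number_integer, %{language: :erlang}, "42"}.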
  root_element_combinator =
    choice([
      module_attribute,
      hashbang,
      whitespace,
      comment,
      erlang_string,
      record,
      punctuation,
      # `tuple` might be unnecessary
      tuple,
      syntax_operators,
      # Numbers
      number_integer_in_weird_base,
      number_float,
      number_integer,
      # Variables
      variable,
      namespace,
      function_arity,
      function,
      atom,
      macro,
      character,
      label,
      # If we can't parse any of the above, we highlight the next character as an error
      # and proceed from there.
      # A lexer should always consume any string given as input.
      any_char
    ])

  ##############################################################################
  # Semi-public API: these two functions can be used by someone who wants to
  # embed this lexer into another lexer, but other than that, they are not
  # meant to be used by end-users
  ##############################################################################

  @inline Application.get_env(:makeup_erlang, :inline, false)

  @impl Makeup.Lexer
  defparsec(
    :root_element,
    root_element_combinator |> map({__MODULE__, :__as_erlang_language__, []}),
    inline: @inline
  )

  @impl Makeup.Lexer
  defparsec(
    :root,
    repeat(parsec(:root_element)),
    inline: @inline
  )

  ###################################################################
  # Step #2: postprocess the list of tokens
  ###################################################################

  @keywords ~W[after begin case catch cond end fun if let of query receive try when]

  @builtins ~W[
    abs append_element apply atom_to_list binary_to_list bitstring_to_list
    binary_to_term bit_size bump_reductions byte_size cancel_timer
    check_process_code delete_module demonitor disconnect_node display
    element erase exit float float_to_list fun_info fun_to_list
    function_exported garbage_collect get get_keys group_leader hash
    hd integer_to_list iolist_to_binary iolist_size is_atom is_binary
    is_bitstring is_boolean is_builtin is_float is_function is_integer
    is_list is_number is_pid is_port is_process_alive is_record is_reference
    is_tuple length link list_to_atom list_to_binary list_to_bitstring
    list_to_existing_atom list_to_float list_to_integer list_to_pid
    list_to_tuple load_module localtime_to_universaltime make_tuple
    md5 md5_final md5_update memory module_loaded monitor monitor_node
    node nodes open_port phash phash2 pid_to_list port_close port_command
    port_connect port_control port_call port_info port_to_list
    process_display process_flag process_info purge_module put read_timer
    ref_to_list register resume_process round send send_after send_nosuspend
    set_cookie setelement size spawn spawn_link spawn_monitor spawn_opt
    split_binary start_timer statistics suspend_process system_flag
    system_info system_monitor system_profile term_to_binary tl trace
    trace_delivered trace_info trace_pattern trunc tuple_size tuple_to_list
    universaltime_to_localtime unlink unregister whereis
  ]

  @word_operators ~W[and andalso band bnot bor bsl bsr bxor div not or orelse rem xor]

  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @keywords,
    do: [{:keyword, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @builtins,
    do: [{:name_builtin, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([{:string_symbol, meta, value} | tokens]) when value in @word_operators,
    do: [{:operator_word, meta, value} | postprocess_helper(tokens)]

  defp postprocess_helper([token | tokens]), do: [token | postprocess_helper(tokens)]

  defp postprocess_helper([]), do: []

  # Reclassify atom tokens that are actually keywords, builtins or word operators
  @impl Makeup.Lexer
  def postprocess(tokens, _opts \\ []), do: postprocess_helper(tokens)
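  # For example, postprocessing turns {:string_symbol, meta, "case"} into
  # {:keyword, meta, "case"}, and {:string_symbol, meta, "length"} into
  # {:name_builtin, meta, "length"}.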
  #######################################################################
  # Step #3: highlight matching delimiters
  # By default, this includes delimiters that are used in many languages,
  # but feel free to delete these or add more.
  #######################################################################

  @impl Makeup.Lexer
  defgroupmatcher(:match_groups,
    parentheses: [
      open: [[{:punctuation, %{language: :erlang}, "("}]],
      close: [[{:punctuation, %{language: :erlang}, ")"}]]
    ],
    list: [
      open: [
        [{:punctuation, %{language: :erlang}, "["}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "]"}]
      ]
    ],
    tuple: [
      open: [
        [{:punctuation, %{language: :erlang}, "{"}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "}"}]
      ]
    ],
    map: [
      open: [
        [{:punctuation, %{language: :erlang}, "\#{"}]
      ],
      close: [
        [{:punctuation, %{language: :erlang}, "}"}]
      ]
    ]
  )

  defp remove_initial_newline([{ttype, meta, text} | tokens]) do
    case to_string(text) do
      "\n" -> tokens
      "\n" <> rest -> [{ttype, meta, rest} | tokens]
    end
  end

  # Finally, the public API for the lexer
  @impl Makeup.Lexer
  def lex(text, opts \\ []) do
    group_prefix = Keyword.get(opts, :group_prefix, random_prefix(10))
    {:ok, tokens, "", _, _, _} = root("\n" <> text)

    tokens
    |> remove_initial_newline()
    |> postprocess()
    |> match_groups(group_prefix)
  end
end
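# A minimal usage sketch: `lex/2` above is the public entry point defined by the
# `Makeup.Lexer` behaviour. The token list shown is illustrative, not verbatim output.
#
#     Makeup.Lexers.ErlangLexer.lex("foo(X) -> X + 1.")
#     #=> [{:name_function, %{language: :erlang, ...}, "foo"},
#     #=>  {:punctuation, %{language: :erlang, ...}, "("},
#     #=>  {:name, %{language: :erlang}, "X"},
#     #=>  ...]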