elixir_lexer.ex (18779B)
defmodule Makeup.Lexers.ElixirLexer do
  @moduledoc """
  A `Makeup` lexer for the Elixir language.

  Lexing is done in three steps, all driven by `lex/2`:

    1. the input is tokenized by the `root` parser;
    2. the resulting token list is postprocessed (`postprocess/2`), which retags
       plain names such as keywords, declarations and constants;
    3. matching delimiters are grouped (`match_groups`).
  """

  import NimbleParsec
  import Makeup.Lexer.Combinators
  import Makeup.Lexer.Groups
  import Makeup.Lexers.ElixirLexer.Helper

  @behaviour Makeup.Lexer

  ###################################################################
  # Step #1: tokenize the input (into a list of tokens)
  ###################################################################
  # We will often compose combinators into larger combinators.
  # Sometimes, the smaller combinator is useful on its own as a token, and sometimes it isn't.
  # We'll adopt the following "convention":
  #
  # 1. A combinator that ends with `_name` returns a string
  # 2. Other combinators will *usually* return a token
  #
  # Why this convention? Tokens can't be composed further, while raw strings can.
  # This way, we immediately know which of the combinators we can compose.
  # TODO: check we're following this convention
  # NOTE: if Elixir had a good static type system it would help us do the right thing here.
27 28 whitespace = ascii_string([?\r, ?\s, ?\n, ?\f], min: 1) |> token(:whitespace) 29 30 newlines = 31 choice([string("\r\n"), string("\n")]) 32 |> optional(ascii_string([?\s, ?\n, ?\f, ?\r], min: 1)) 33 |> token(:whitespace) 34 35 any_char = utf8_char([]) |> token(:error) 36 37 # Numbers 38 digits = ascii_string([?0..?9], min: 1) 39 bin_digits = ascii_string([?0..?1], min: 1) 40 hex_digits = ascii_string([?0..?9, ?a..?f, ?A..?F], min: 1) 41 oct_digits = ascii_string([?0..?7], min: 1) 42 # Digits in an integer may be separated by underscores 43 number_bin_part = with_optional_separator(bin_digits, "_") 44 number_oct_part = with_optional_separator(oct_digits, "_") 45 number_hex_part = with_optional_separator(hex_digits, "_") 46 integer = with_optional_separator(digits, "_") 47 48 # Tokens for the lexer 49 number_bin = string("0b") |> concat(number_bin_part) |> token(:number_bin) 50 number_oct = string("0o") |> concat(number_oct_part) |> token(:number_oct) 51 number_hex = string("0x") |> concat(number_hex_part) |> token(:number_hex) 52 # Base 10 53 number_integer = token(integer, :number_integer) 54 55 # Floating point numbers 56 float_scientific_notation_part = 57 ascii_string([?e, ?E], 1) 58 |> optional(string("-")) 59 |> concat(integer) 60 61 number_float = 62 integer 63 |> string(".") 64 |> concat(integer) 65 |> optional(float_scientific_notation_part) 66 |> token(:number_float) 67 68 variable_name = 69 parsec({Makeup.Lexers.ElixirLexer.Variables, :variable_start_chars}) 70 |> repeat(parsec({Makeup.Lexers.ElixirLexer.Variables, :variable_continue_chars})) 71 |> optional(utf8_char([??, ?!])) 72 73 variable = 74 variable_name 75 |> lexeme 76 |> token(:name) 77 78 # TODO: as above 79 alias_part = 80 ascii_string([?A..?Z], 1) 81 |> optional(ascii_string([?a..?z, ?_, ?0..?9, ?A..?Z], min: 1)) 82 83 module_name = alias_part |> concat(repeat(string(".") |> concat(alias_part))) 84 85 module = token(module_name, :name_class) 86 87 operator_name = word_from_list(~W( 88 
<<< >>> ||| &&& ^^^ ~~~ === !== ~>> <~> |~> <|> 89 == != <= >= && || \\ <> ++ -- |> =~ -> <- ~> <~ :: .. 90 = < > + - * / | . ^ & ! // 91 )) 92 93 operator = token(operator_name, :operator) 94 95 # The lexer needs to test these before `>>` and `<<` 96 bitshifts = 97 word_from_list(~W(<<< >>>)) 98 |> token(:operator) 99 100 special_atom_name = word_from_list(~W(... <<>> %{} % {} ..//)) 101 102 triple_dot = token("...", :name) 103 104 map_arrow = token("=>", :punctuation) 105 106 anon_function_arguments = 107 string("&") 108 |> concat(digits) 109 |> token(:name_entity) 110 111 normal_char = 112 string("?") 113 |> utf8_string([], 1) 114 |> token(:string_char) 115 116 escape_char = 117 string("?\\") 118 |> utf8_string([], 1) 119 |> token(:string_char) 120 121 special_atom = 122 string(":") 123 |> concat(special_atom_name) 124 |> token(:string_symbol) 125 126 attribute = 127 string("@") 128 |> concat(variable_name) 129 |> token(:name_attribute) 130 131 punctuation = 132 word_from_list( 133 [":", ";", ",", ".", "%"], 134 :punctuation 135 ) 136 137 # Combinators that highlight elixir expressions surrounded by a pair of delimiters. 
  # Most of the time, the delimiters can be described by simple characters, but the
  # combinator that parses a struct is more complex
  #
  # String interpolation: the `\#{` and `}` delimiters are tagged as `:string_interpol`
  # instead of the default used by the other pairs below.
  interpolation = many_surrounded_by(parsec(:root_element), "\#{", "}", :string_interpol)
  tuple = many_surrounded_by(parsec(:root_element), "{", "}")

  binary_inside_opaque_struct = many_surrounded_by(parsec(:root_element), "<<", ">>")
  # Only for the IEx lexer (it's not valid Elixir code):
  # an opaque struct looks like `#Module<...>`. Binaries are tried first inside it,
  # before falling back to any other root element.
  opaque_struct =
    many_surrounded_by(
      choice([
        binary_inside_opaque_struct,
        parsec(:root_element)
      ]),
      token("#", :punctuation) |> concat(module) |> concat(token("<", :punctuation)),
      token(">", :punctuation)
    )

  # Delimiters that are highlighted on their own (no attempt is made here to pair
  # them up with their contents).
  delimiters_punctuation =
    word_from_list(
      ~W( ( \) [ ] << >>),
      :punctuation
    )

  map = many_surrounded_by(parsec(:root_element), "%{", "}")

  delimiter_pairs = [
    delimiters_punctuation,
    tuple,
    map
  ]

  # Atom names; like variable names, they may end in `?` or `!`
  normal_atom_name =
    parsec({Makeup.Lexers.ElixirLexer.Atoms, :atom_start_chars})
    |> repeat(parsec({Makeup.Lexers.ElixirLexer.Atoms, :atom_continue_chars}))
    |> optional(utf8_char([??, ?!]))

  # `:name` or an operator used as an atom (e.g. `:+`)
  normal_atom =
    string(":")
    |> choice([operator_name, normal_atom_name])
    |> token(:string_symbol)

  # `\u` followed by exactly 4 hexadecimal digits
  unicode_char_in_string =
    string("\\u")
    |> ascii_string([?0..?9, ?a..?f, ?A..?F], 4)
    |> token(:string_escape)

  # A backslash followed by any single character
  escaped_char =
    string("\\")
    |> utf8_string([], 1)
    |> token(:string_escape)

  # We must support iex prompts inside a string, sigil or heredoc.
  # For example:
  #
  #   iex(1)> a = """
  #   ...(1)> line1
  #   ...(1)> line2
  #   ...(1)> """
  #
  # Inside the string we don't expect the `iex>` prompt, only the `...>` prompt.
198 iex_prompt_inside_string = 199 string("\n...") 200 |> optional(string("(") |> concat(digits) |> string(")")) 201 |> string(">") 202 |> optional(string(" ")) 203 |> token(:generic_prompt, %{selectable: false}) 204 205 combinators_inside_string = [ 206 unicode_char_in_string, 207 escaped_char, 208 interpolation, 209 iex_prompt_inside_string 210 ] 211 212 string_atom = 213 choice([ 214 string_like(":\"", "\"", combinators_inside_string, :string_symbol), 215 string_like(":'", "'", combinators_inside_string, :string_symbol) 216 ]) 217 218 atom = 219 choice([ 220 special_atom, 221 normal_atom, 222 string_atom 223 ]) 224 225 string_keyword = 226 choice([ 227 string_like("\"", "\"", combinators_inside_string, :string_symbol), 228 string_like("'", "'", combinators_inside_string, :string_symbol) 229 ]) 230 |> concat(token(string(":"), :punctuation)) 231 232 normal_keyword = 233 choice([operator_name, normal_atom_name]) 234 |> token(:string_symbol) 235 |> concat(token(string(":"), :punctuation)) 236 237 keyword = 238 choice([ 239 normal_keyword, 240 string_keyword 241 ]) 242 |> lookahead(whitespace) 243 244 sigil_delimiters = [ 245 {~S["""], ~S["""]}, 246 {"'''", "'''"}, 247 {"\"", "\""}, 248 {"'", "'"}, 249 {"/", "/"}, 250 {"{", "}"}, 251 {"[", "]"}, 252 {"(", ")"}, 253 {"<", ">"}, 254 {"|", "|"} 255 ] 256 257 sigils_interpol = 258 for {ldelim, rdelim} <- sigil_delimiters do 259 sigil(ldelim, rdelim, [?a..?z], combinators_inside_string) 260 end 261 262 sigils_no_interpol = 263 for {ldelim, rdelim} <- sigil_delimiters do 264 sigil(ldelim, rdelim, [?A..?Z], [escape_delim(rdelim), iex_prompt_inside_string]) 265 end 266 267 all_sigils = sigils_interpol ++ sigils_no_interpol 268 269 double_quoted_string_interpol = string_like("\"", "\"", combinators_inside_string, :string) 270 single_quoted_string_interpol = string_like("'", "'", combinators_inside_string, :string_char) 271 double_quoted_heredocs = string_like(~S["""], ~S["""], combinators_inside_string, :string) 272 
single_quoted_heredocs = string_like("'''", "'''", combinators_inside_string, :string_char) 273 274 # `#PID<123.456.789>` 275 pid = 276 token("#", :punctuation) 277 |> concat(token("PID", :name_class)) 278 |> concat(token("<", :punctuation)) 279 |> concat(number_integer) 280 |> concat(token(".", :operator)) 281 |> concat(number_integer) 282 |> concat(token(".", :operator)) 283 |> concat(number_integer) 284 |> concat(token(">", :punctuation)) 285 286 line = repeat(lookahead_not(ascii_char([?\n])) |> utf8_string([], 1)) 287 288 inline_comment = 289 string("#") 290 |> concat(line) 291 |> token(:comment_single) 292 293 # An IEx prompt is supported in the normal Elixir lexer because false positives 294 # would be extremely rare 295 iex_prompt = 296 choice([string("iex"), string("...")]) 297 |> optional(string("(") |> concat(digits) |> string(")")) 298 |> string(">") 299 |> optional(string(" ")) 300 |> token(:generic_prompt, %{selectable: false}) 301 302 stacktrace = 303 string("** (") 304 # The rest of the line is part of the traceback 305 |> concat(line) 306 # All lines indented by 4 spaces are part of the traceback 307 |> repeat(string("\n ") |> concat(line)) 308 |> token(:generic_traceback) 309 310 root_element_combinator = 311 choice( 312 [ 313 # START of IEx-specific tokens 314 # IEx prompt must come before names 315 newlines |> choice([iex_prompt, stacktrace]), 316 # a PID is a special kind of opaque struct 317 pid, 318 # Opaque struct (must come before inline comments) 319 opaque_struct, 320 # END of IEx-specific tokens 321 whitespace, 322 # Comments 323 inline_comment, 324 # Syntax sugar for keyword lists (must come before variables and strings) 325 keyword, 326 # Strings and sigils 327 double_quoted_heredocs, 328 single_quoted_heredocs, 329 double_quoted_string_interpol, 330 single_quoted_string_interpol 331 ] ++ 332 all_sigils ++ 333 [ 334 # Chars 335 escape_char, 336 normal_char, 337 # Atoms 338 atom, 339 # Module attributes 340 attribute, 341 # Anonymous 
function arguments (must come before the operators) 342 anon_function_arguments, 343 # Bitwise operators must match first 344 bitshifts 345 # Matching delimiters 346 ] ++ 347 delimiter_pairs ++ 348 [ 349 # Triple dot (must come before operators) 350 triple_dot, 351 # Map arrow (must come before operators) 352 map_arrow, 353 # Operators 354 operator, 355 # Numbers 356 number_bin, 357 number_oct, 358 number_hex, 359 # Floats must come before integers 360 number_float, 361 number_integer, 362 # Names 363 variable, 364 # Module names 365 module, 366 punctuation, 367 # If we can't parse any of the above, we highlight the next character as an error 368 # and proceed from there. 369 # A lexer should always consume any string given as input. 370 any_char 371 ] 372 ) 373 374 # By default, don't inline the lexers. 375 # Inlining them increases performance by ~20% 376 # at the cost of doubling the compilation times... 377 @inline false 378 379 @doc false 380 def __as_elixir_language__({ttype, meta, value}) do 381 {ttype, Map.put(meta, :language, :elixir), value} 382 end 383 384 # Semi-public API: these two functions can be used by someone who wants to 385 # embed an Elixir lexer into another lexer, but other than that, they are not 386 # meant to be used by end-users. 
387 388 # @impl Makeup.Lexer 389 defparsec( 390 :root_element, 391 root_element_combinator |> map({__MODULE__, :__as_elixir_language__, []}), 392 inline: @inline, 393 export_combinator: true 394 ) 395 396 # @impl Makeup.Lexer 397 defparsec( 398 :root, 399 repeat(parsec(:root_element)), 400 inline: @inline, 401 export_combinator: true 402 ) 403 404 ################################################################### 405 # Step #2: postprocess the list of tokens 406 ################################################################### 407 408 @def_like ~W[def defp defmacro defmacrop defguard defguardp defn defnp] 409 @keyword_declaration @def_like ++ ~W[ 410 defmodule defprotocol defdelegate defexception defstruct defimpl] 411 @keyword ~W[ 412 fn do end after else rescue catch with 413 case cond for if unless try receive raise 414 quote unquote unquote_splicing throw super] 415 @operator_word ~W[not and or when in] 416 @keyword_namespace ~W[import require use alias] 417 @name_constant ~W[nil true false] 418 @name_builtin_pseudo ~W[_ __MODULE__ __DIR__ __ENV__ __CALLER__] 419 420 # The `postprocess/1` function will require a major redesign when we decide to support 421 # custom `def`-like keywords supplied by the user. 422 defp postprocess_helper([]), do: [] 423 424 # In an expression such as: 425 # 426 # def a + b, do: nil 427 # 428 # the variable_name `a` is a parameter for the `+/2` operator. 429 # It should not be highlighted as a function name. 430 # for that, we must scan a little further (one additional token) for the operator. 
431 defp postprocess_helper([ 432 {:name, attrs1, text1}, 433 {:whitespace, _, _} = ws1, 434 {:name, _, text2} = param, 435 {:whitespace, _, _} = ws2, 436 {:operator, _, _} = op 437 | tokens 438 ]) 439 when text1 in @def_like and text2 != "unquote" do 440 [{:keyword_declaration, attrs1, text1}, ws1, param, ws2, op | postprocess_helper(tokens)] 441 end 442 443 # The same as above without whitespace 444 defp postprocess_helper([ 445 {:name, attrs1, text1}, 446 {:whitespace, _, _} = ws, 447 {:name, _, text2} = param, 448 {:operator, _, _} = op 449 | tokens 450 ]) 451 when text1 in @def_like and text2 != "unquote" do 452 [{:keyword_declaration, attrs1, text1}, ws, param, op | postprocess_helper(tokens)] 453 end 454 455 # If we're matching this branch, we already know that this is not an operator definition. 456 # We can highlight the variable_name after the function name as a function name. 457 defp postprocess_helper([ 458 {:name, attrs1, text1}, 459 {:whitespace, _, _} = ws, 460 {:name, attrs2, text2} | tokens 461 ]) 462 when text1 in @def_like and text2 != "unquote" do 463 [ 464 {:keyword_declaration, attrs1, text1}, 465 ws, 466 {:name_function, attrs2, text2} | postprocess_helper(tokens) 467 ] 468 end 469 470 # When calling functions from an erlang module, highlight the atom as a module. 
471 # 472 # :crypto.strong_rand_bytes(4) 473 defp postprocess_helper([ 474 {:string_symbol, attrs1, [":" | _] = module}, 475 {:operator, _, "."} = op, 476 {:name, _, _} = text 477 | tokens 478 ]) do 479 [{:name_class, attrs1, module}, op, text | postprocess_helper(tokens)] 480 end 481 482 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword, 483 do: [{:keyword, attrs, text} | postprocess_helper(tokens)] 484 485 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword_declaration, 486 do: [{:keyword_declaration, attrs, text} | postprocess_helper(tokens)] 487 488 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @operator_word, 489 do: [{:operator_word, attrs, text} | postprocess_helper(tokens)] 490 491 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @keyword_namespace, 492 do: [{:keyword_namespace, attrs, text} | postprocess_helper(tokens)] 493 494 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @name_constant, 495 do: [{:name_constant, attrs, text} | postprocess_helper(tokens)] 496 497 defp postprocess_helper([{:name, attrs, text} | tokens]) when text in @name_builtin_pseudo, 498 do: [{:name_builtin_pseudo, attrs, text} | postprocess_helper(tokens)] 499 500 # Unused variables 501 defp postprocess_helper([{:name, attrs, "_" <> _name = text} | tokens]), 502 do: [{:comment, attrs, text} | postprocess_helper(tokens)] 503 504 # Otherwise, don't do anything with the current token and go to the next token. 
505 defp postprocess_helper([token | tokens]), do: [token | postprocess_helper(tokens)] 506 507 # Public API 508 @impl Makeup.Lexer 509 def postprocess(tokens, _opts \\ []), do: postprocess_helper(tokens) 510 511 ################################################################### 512 # Step #3: highlight matching delimiters 513 ################################################################### 514 515 @impl Makeup.Lexer 516 defgroupmatcher(:match_groups, 517 do_end: [ 518 open: [ 519 [{:keyword, %{language: :elixir}, "do"}] 520 ], 521 middle: [ 522 [{:keyword, %{language: :elixir}, "else"}], 523 [{:keyword, %{language: :elixir}, "catch"}], 524 [{:keyword, %{language: :elixir}, "rescue"}], 525 [{:keyword, %{language: :elixir}, "after"}] 526 ], 527 close: [ 528 [{:keyword, %{language: :elixir}, "end"}] 529 ] 530 ], 531 fn_end: [ 532 open: [[{:keyword, %{language: :elixir}, "fn"}]], 533 close: [[{:keyword, %{language: :elixir}, "end"}]] 534 ], 535 parentheses: [ 536 open: [[{:punctuation, %{language: :elixir}, "("}]], 537 close: [[{:punctuation, %{language: :elixir}, ")"}]] 538 ], 539 list: [ 540 open: [ 541 [{:punctuation, %{language: :elixir}, "["}] 542 ], 543 close: [ 544 [{:punctuation, %{language: :elixir}, "]"}] 545 ] 546 ], 547 tuple: [ 548 open: [ 549 [{:punctuation, %{language: :elixir}, "{"}] 550 ], 551 close: [ 552 [{:punctuation, %{language: :elixir}, "}"}] 553 ] 554 ], 555 map: [ 556 open: [ 557 [{:punctuation, %{language: :elixir}, "%{"}] 558 ], 559 close: [ 560 [{:punctuation, %{language: :elixir}, "}"}] 561 ] 562 ], 563 struct: [ 564 open: [ 565 [ 566 {:punctuation, %{language: :elixir}, "%"}, 567 {:name_class, %{language: :elixir}, _}, 568 {:punctuation, %{language: :elixir}, "{"} 569 ] 570 ], 571 close: [ 572 [{:punctuation, %{language: :elixir}, "}"}] 573 ] 574 ], 575 opaque_struct: [ 576 open: [ 577 [ 578 {:punctuation, %{language: :elixir}, "#"}, 579 {:name_class, %{language: :elixir}, _}, 580 {:punctuation, %{language: :elixir}, "<"} 581 ] 582 
], 583 close: [ 584 [{:punctuation, %{language: :elixir}, ">"}] 585 ] 586 ], 587 binaries: [ 588 open: [ 589 [{:punctuation, %{language: :elixir}, "<<"}] 590 ], 591 close: [ 592 [{:punctuation, %{language: :elixir}, ">>"}] 593 ] 594 ], 595 interpolation: [ 596 open: [ 597 [{:string_interpol, %{language: :elixir}, "\#{"}] 598 ], 599 close: [ 600 [{:string_interpol, %{language: :elixir}, "}"}] 601 ] 602 ] 603 ) 604 605 defp remove_initial_newline([{ttype, meta, text} | tokens]) do 606 case to_string(text) do 607 "\n" -> tokens 608 "\n" <> rest -> [{ttype, meta, rest} | tokens] 609 end 610 end 611 612 # Finally, the public API for the lexer 613 @impl Makeup.Lexer 614 def lex(text, opts \\ []) do 615 group_prefix = Keyword.get(opts, :group_prefix, random_prefix(10)) 616 {:ok, tokens, "", _, _, _} = root("\n" <> text) 617 618 tokens 619 |> remove_initial_newline() 620 |> postprocess([]) 621 |> match_groups(group_prefix) 622 end 623 end