lexer.ex (2839B)
defmodule Makeup.Lexer do
  @moduledoc """
  A lexer turns raw source code into a list of tokens.
  """
  alias Makeup.Lexer.Types, as: T
  alias Makeup.Lexer.Postprocess

  @doc """
  Parses the smallest number of tokens that make sense.
  It's a `parsec`.
  """
  @callback root_element(String.t()) :: T.parsec_result

  @doc """
  Parses the given string into a `parsec` result that includes a list of tokens.
  """
  @callback root(String.t()) :: T.parsec_result

  @doc """
  Post-processes a list of tokens before matching the contained groups.
  """
  @callback postprocess([T.token()], list()) :: [T.token()]

  @doc """
  Matches groups in a list of tokens.
  """
  @callback match_groups([T.token()], String.t()) :: [T.token()]

  @doc """
  Lexes a string into a list of tokens.
  """
  @callback lex(String.t(), list()) :: [T.token()]

  @doc """
  Merges the token values into the original string.

  Inverts the output of a lexer. That is, if `lexer` is a lexer, then:

      string |> lexer.lex() |> Makeup.Lexer.unlex() == string

  This only works for a correctly implemented lexer, of course.
  The above identity can be treated as a lexer invariant for newly implemented lexers.
  """
  @spec unlex(list(T.token())) :: String.t()
  def unlex(tokens) do
    # map_join builds the result in one pass, avoiding the intermediate
    # list that `map |> map |> join` would allocate.
    Enum.map_join(tokens, fn token ->
      {_tag, _meta, value} = Postprocess.token_value_to_binary(token)
      value
    end)
  end

  @doc """
  Splits a list of tokens on newline characters (`\\n`).

  The result is a list of lists of tokens with no newlines.
  """
  @spec split_into_lines(list(T.token())) :: list(list(T.token()))
  def split_into_lines(tokens) do
    # Accumulator is {completed_lines, current_line}; both are kept in
    # reverse order so that prepending is O(1), then reversed once at the end.
    {lines, last_line} =
      Enum.reduce(tokens, {[], []}, fn token, {lines, line} ->
        {ttype, meta, text} = Postprocess.token_value_to_binary(token)

        case String.split(text, "\n") do
          # No newline inside this token: it stays on the current line.
          [_] ->
            {lines, [token | line]}

          # The token spans two or more lines: `part` closes the current
          # line, each middle part is a whole line on its own, and the
          # final part opens the next current line.
          [part | parts] ->
            first_line = :lists.reverse([{ttype, meta, part} | line])

            # `parts` is non-empty here (the single-element case matched
            # above). One reverse exposes the last part and yields the middle
            # parts already in accumulator order — this replaces the
            # deprecated negative-step range `Enum.slice(0..-2)` (warns on
            # Elixir >= 1.16) and the extra `Enum.at(parts, -1)` traversal.
            [last | middles_reversed] = :lists.reverse(parts)

            middle_lines =
              Enum.map(middles_reversed, fn tok_text -> [{ttype, meta, tok_text}] end)

            {middle_lines ++ [first_line | lines], [{ttype, meta, last}]}
        end
      end)

    :lists.reverse([last_line | lines])
  end

  @doc """
  Merge adjacent tokens of the same type and with the same attributes.

  Doing this will require iterating over the list of tokens again,
  so only do this if you have a good reason.
  """
  @spec merge(list(T.token())) :: list(T.token())
  # Repeated bindings (`tag`, `meta`) only match when both tokens carry
  # equal tags and equal metadata, so only truly identical-attribute
  # neighbours are merged.
  def merge([{tag, meta, value1}, {tag, meta, value2} | rest]),
    do: merge([{tag, meta, value1 <> value2} | rest])

  def merge([token | rest]),
    do: [token | merge(rest)]

  def merge([]),
    do: []
end