combinators.ex (7288B)
defmodule Makeup.Lexer.Combinators do
  @moduledoc """
  Common components useful in many lexers.
  """
  import NimbleParsec

  @doc """
  Wraps the given combinator into a token of the given `token_type`.

  Instead of a combinator, the first argument can also be a string literal.
  In that case the literal is matched and replaced by the token
  `{token_type, %{}, literal}`.
  """
  def token(literal, token_type) when is_binary(literal) do
    replace(string(literal), {token_type, %{}, literal})
  end

  def token(combinator, token_type) do
    combinator |> post_traverse({__MODULE__, :__token__, [token_type]})
  end

  @doc """
  Wraps the given combinator into a token of the given `token_type`,
  using the map `attrs` as the token attributes.

  Instead of a combinator, the first argument can also be a string literal.
  """
  def token(literal, token_type, attrs) when is_binary(literal) and is_map(attrs) do
    replace(string(literal), {token_type, attrs, literal})
  end

  def token(combinator, token_type, attrs) when is_map(attrs) do
    combinator |> post_traverse({__MODULE__, :__token__, [token_type, attrs]})
  end

  @doc """
  Joins the result of the given combinator into a single string.

  This is not usually necessary, but it can be useful if you want to match on the tokens.
  It's easier to match on the token `{:keyword, %{}, "unquote"}` than on something like
  `{:keyword, %{}, ["u", "nquote"]}`, even though both tokens will be treated the same way
  by the formatter.
  """
  def lexeme(combinator) do
    combinator |> post_traverse({__MODULE__, :__lexeme__, []})
  end

  # `post_traverse` callback registered by `token/2`.
  # A single result becomes the token value as-is; several results are
  # reversed before being stored, because the callback receives them
  # most-recent-first.
  @doc false
  def __token__(rest, [arg], context, _line, _offset, token_type) do
    {rest, [{token_type, %{}, arg}], context}
  end

  def __token__(rest, arg, context, _line, _offset, token_type) when is_binary(arg) do
    {rest, [{token_type, %{}, arg}], context}
  end

  def __token__(rest, args, context, _line, _offset, token_type) do
    {rest, [{token_type, %{}, args |> :lists.reverse()}], context}
  end

  # `post_traverse` callback registered by `token/3`; same as `__token__/6`
  # but the token carries the given `attrs` instead of an empty map.
  @doc false
  def __token__(rest, [arg], context, _line, _offset, token_type, attrs) do
    {rest, [{token_type, attrs, arg}], context}
  end

  def __token__(rest, arg, context, _line, _offset, token_type, attrs) when is_binary(arg) do
    {rest, [{token_type, attrs, arg}], context}
  end

  def __token__(rest, args, context, _line, _offset, token_type, attrs) do
    {rest, [{token_type, attrs, args |> :lists.reverse()}], context}
  end

  # `post_traverse` callback registered by `lexeme/1`: reverses the results
  # and converts them into one binary.
  @doc false
  def __lexeme__(rest, args, context, _line, _offset) do
    result = args |> List.wrap() |> :lists.reverse() |> to_string()
    {rest, [result], context}
  end

  # Sorts the strings in descending order of `{byte_size, string}`, so that
  # longer strings come before any of their prefixes.
  defp reverse_sort(items) do
    Enum.sort(items, fn a, b -> {byte_size(a), a} > {byte_size(b), b} end)
  end

  # Turns a delimiter given as a string literal into a combinator;
  # anything else is assumed to already be a combinator.
  defp delimiter(literal) when is_binary(literal), do: string(literal)
  defp delimiter(combinator), do: combinator

  # Appends "one of `alternatives`" to `combinator`.
  # `NimbleParsec.choice/2` expects at least two alternatives, so a single
  # alternative is concatenated directly instead of being wrapped in a choice.
  defp append_choice(combinator, [single]), do: concat(combinator, single)
  defp append_choice(combinator, alternatives), do: choice(combinator, alternatives)

  @doc """
  Matches one of the literal strings in the list.

  The strings aren't matched in order: they are automatically sorted in a way
  that guarantees that the longest strings will be tried first.

  The list must contain at least one string. A list with a single string is
  supported and matches just that string.

  ## Examples

      keywords = word_from_list(~w[do end catch after rescue])
  """
  def word_from_list(words) do
    literals = for word <- reverse_sort(words), do: string(word)
    append_choice(empty(), literals)
  end

  @doc """
  Matches one of the literal strings in the list and wraps it in a token of the given type.

  This is just a shorthand.

  The strings aren't matched in order: they are automatically sorted in a way
  that guarantees that the longest strings will be tried first.

  ## Examples

      keywords = word_from_list(~w[do end catch after rescue], :keyword)
  """
  def word_from_list(words, ttype) do
    words |> word_from_list() |> token(ttype)
  end

  @doc """
  Matches one of the literal strings in the list and wraps it in a token of the given `ttype`,
  with the given `attrs`.

  This is just a shorthand.

  The strings aren't matched in order: they are automatically sorted in a way
  that guarantees that the longest strings will be tried first.
  """
  def word_from_list(words, ttype, attrs) do
    words |> word_from_list() |> token(ttype, attrs)
  end

  @doc """
  Matches a given combinator, repeated 0 or more times, surrounded by left and right delimiters.

  Delimiters can be combinators or literal strings (either both combinators or both literal strings).
  Literal string delimiters are wrapped in `:punctuation` tokens; combinator
  delimiters are used as given.
  """
  def many_surrounded_by(combinator, left, right) when is_binary(left) and is_binary(right) do
    token(left, :punctuation)
    |> concat(
      repeat(
        lookahead_not(string(right))
        |> concat(combinator)
      )
    )
    |> concat(token(right, :punctuation))
  end

  def many_surrounded_by(combinator, left, right) do
    left
    |> concat(
      repeat(
        lookahead_not(right)
        |> concat(combinator)
      )
    )
    |> concat(right)
  end

  @doc """
  Matches a given combinator, repeated 0 or more times, surrounded by left and right delimiters,
  and wraps the `right` and `left` delimiters into a token of the given `ttype`.

  Each delimiter can be a literal string or a combinator.
  """
  def many_surrounded_by(combinator, left, right, ttype) do
    token(left, ttype)
    |> concat(
      repeat(
        # `right` may be a literal or a combinator, so normalize it before
        # using it as the stop condition.
        lookahead_not(delimiter(right))
        |> concat(combinator)
      )
    )
    |> concat(token(right, ttype))
  end

  # `post_traverse` callback registered by `string_like/5`: groups consecutive
  # raw characters (integers) and binaries into tokens of type `ttype`,
  # leaving every other element untouched.
  @doc false
  def collect_raw_chars_and_binaries(rest, args, context, _line, _offset, ttype, attrs) do
    result = merge_chars_helper(ttype, attrs, [], args)
    {rest, result, context}
  end

  # Nothing pending and nothing left to process.
  defp merge_chars_helper(_ttype, _attrs, [], []), do: []

  # Raw character or binary: add it to the pending group. Prepending while
  # walking the (reversed) results restores the original order inside `acc`.
  defp merge_chars_helper(ttype, attrs, acc, [next | rest])
       when is_integer(next) or is_binary(next) do
    merge_chars_helper(ttype, attrs, [next | acc], rest)
  end

  # Any other element with no pending group: pass it through unchanged.
  defp merge_chars_helper(ttype, attrs, [], [element | rest]) do
    [element | merge_chars_helper(ttype, attrs, [], rest)]
  end

  # Pending group followed by another kind of element (or by the end of the
  # list): emit the group as a token and continue with an empty group.
  defp merge_chars_helper(ttype, attrs, acc, list) do
    tok = {ttype, attrs, acc}
    [tok | merge_chars_helper(ttype, attrs, [], list)]
  end

  @doc """
  A generic combinator for string-like syntactic structures.

  It takes the following parameters:

    * `left` - left delimiter for the string. Can be a binary or a general combinator.
    * `right` - right delimiter for the string. Can be a binary or a general combinator.
    * `middle` - a list of parsers to run inside the string which parse entities
      that aren't characters.
      The most common example are special characters and string interpolation
      for languages that support it like Elixir.
      The list may be empty, in which case only ordinary characters are matched.
    * `ttype` - the token type to use for the string delimiters and ordinary characters
      (everything that the `middle` combinators did not already turn into tokens)
    * `attrs` - metadata attributes for the string delimiters and ordinary characters

  ## Examples

      single_quoted_heredocs = string_like(
        "'''",
        "'''",
        combinators_inside_string,
        :string_char
      )

  The above is equivalent to the following more explicit version:

      single_quoted_heredocs = string_like(
        string("'''"),
        string("'''"),
        combinators_inside_string,
        :string_char
      )
  """
  def string_like(left, right, middle, ttype, attrs \\ %{}) when is_list(middle) do
    left_combinator = delimiter(left)
    right_combinator = delimiter(right)

    # Any single character is the fallback after all the `middle` parsers.
    choices = middle ++ [utf8_char([])]

    left_combinator
    |> repeat(lookahead_not(right_combinator) |> append_choice(choices))
    |> concat(right_combinator)
    |> post_traverse({__MODULE__, :collect_raw_chars_and_binaries, [ttype, attrs]})
  end
end