lexer.ex (10129B)
defmodule Absinthe.Lexer do
  @moduledoc """
  Tokenizer for GraphQL source documents, implemented with NimbleParsec.

  `tokenize/1` converts a GraphQL document (a binary) into a list of tokens of
  the form `{type, {line, column}}` or `{type, {line, column}, value}`, where
  columns are reported in characters (converted from the parser's byte offsets).
  The grammar comments throughout this module quote the corresponding
  productions from the GraphQL specification's lexical grammar.
  """

  import NimbleParsec

  # Codepoints
  @horizontal_tab 0x0009
  @newline 0x000A
  @carriage_return 0x000D
  @space 0x0020
  @unicode_bom 0xFEFF

  # SourceCharacter :: /[\u0009\u000A\u000D\u0020-\uFFFF]/
  #
  # Accepts any single UTF-8 codepoint; the grammar's narrower SourceCharacter
  # restriction is not enforced here.
  any_unicode = utf8_char([])

  # ## Ignored Tokens

  # UnicodeBOM :: "Byte Order Mark (U+FEFF)"
  unicode_bom = utf8_char([@unicode_bom])

  # WhiteSpace ::
  #   - "Horizontal Tab (U+0009)"
  #   - "Space (U+0020)"
  whitespace =
    ascii_char([
      @horizontal_tab,
      @space
    ])

  # LineTerminator ::
  #   - "New Line (U+000A)"
  #   - "Carriage Return (U+000D)" [ lookahead ! "New Line (U+000A)" ]
  #   - "Carriage Return (U+000D)" "New Line (U+000A)"
  #
  # A bare CR and a CRLF pair are each consumed as one terminator.
  line_terminator =
    choice([
      ascii_char([@newline]),
      ascii_char([@carriage_return])
      |> optional(ascii_char([@newline]))
    ])

  # Comment :: `#` CommentChar*
  # CommentChar :: SourceCharacter but not LineTerminator
  #
  # Consumes up to (not including) the next \n or \r; see not_line_terminator/4.
  comment =
    string("#")
    |> repeat_while(any_unicode, {:not_line_terminator, []})

  # Comma :: ,
  comma = ascii_char([?,])

  # Ampersand :: &
  ampersand = ascii_char([?&])

  # Ignored ::
  #   - UnicodeBOM
  #   - WhiteSpace
  #   - LineTerminator
  #   - Comment
  #   - Comma
  #   - Ampersand
  ignored =
    choice([
      unicode_bom,
      whitespace,
      line_terminator,
      comment,
      comma,
      ampersand
    ])

  # ## Lexical Tokens
  #
  #   - Punctuator
  #   - Name
  #   - IntValue
  #   - FloatValue
  #   - StringValue

  # Single-character punctuators, plus the three-dot spread (`...`).
  # atom_token/5 turns the matched characters into an atom token such as `:"{"`.
  punctuator =
    choice([
      ascii_char([
        ?!,
        ?$,
        ?(,
        ?),
        ?:,
        ?=,
        ?@,
        ?[,
        ?],
        ?{,
        ?|,
        ?}
      ]),
      times(ascii_char([?.]), 3)
    ])
    |> post_traverse({:atom_token, []})

  # Name :: /[_A-Za-z][_0-9A-Za-z]*/
  #
  # Booleans and reserved words share this production; the post_traverse
  # callback classifies the matched word (see
  # do_boolean_value_or_name_or_reserved_word/5).
  boolean_value_or_name_or_reserved_word =
    ascii_char([?_, ?A..?Z, ?a..?z])
    |> repeat(ascii_char([?_, ?0..?9, ?A..?Z, ?a..?z]))
    |> post_traverse({:boolean_value_or_name_or_reserved_word, []})

  # NegativeSign :: -
  negative_sign = ascii_char([?-])

  # Digit :: one of 0 1 2 3 4 5 6 7 8 9
  digit = ascii_char([?0..?9])

  # NonZeroDigit :: Digit but not `0`
  non_zero_digit = ascii_char([?1..?9])

  # IntegerPart ::
  #   - NegativeSign? 0
  #   - NegativeSign? NonZeroDigit Digit*
  integer_part =
    optional(negative_sign)
    |> choice([
      ascii_char([?0]),
      non_zero_digit |> repeat(digit)
    ])

  # IntValue :: IntegerPart
  int_value =
    empty()
    |> concat(integer_part)
    |> post_traverse({:labeled_token, [:int_value]})

  # FractionalPart :: . Digit+
  fractional_part =
    ascii_char([?.])
    |> times(digit, min: 1)

  # ExponentIndicator :: one of `e` `E`
  exponent_indicator = ascii_char([?e, ?E])

  # Sign :: one of + -
  sign = ascii_char([?+, ?-])

  # ExponentPart :: ExponentIndicator Sign? Digit+
  exponent_part =
    exponent_indicator
    |> optional(sign)
    |> times(digit, min: 1)

  # FloatValue ::
  #   - IntegerPart FractionalPart
  #   - IntegerPart ExponentPart
  #   - IntegerPart FractionalPart ExponentPart
  #
  # The middle alternative has no fractional part in the input, so
  # fill_mantissa/5 inserts a literal "0." to normalize the captured value
  # (e.g. `1e3` is tokenized with the characters of `0.1e3`... NOTE(review):
  # fill_mantissa prepends '0.' to the *reversed* accumulator; confirm the
  # intended normalized form against the parser that consumes these tokens.
  float_value =
    choice([
      integer_part |> concat(fractional_part) |> concat(exponent_part),
      integer_part |> post_traverse({:fill_mantissa, []}) |> concat(exponent_part),
      integer_part |> concat(fractional_part)
    ])
    |> post_traverse({:labeled_token, [:float_value]})

  # EscapedUnicode :: /[0-9A-Fa-f]{4}/
  #
  # Exactly four hex digits; unescape_unicode/5 converts them to the UTF-8
  # encoding of the designated codepoint.
  escaped_unicode =
    times(ascii_char([?0..?9, ?A..?F, ?a..?f]), 4)
    |> post_traverse({:unescape_unicode, []})

  # EscapedCharacter :: one of `"` \ `/` b f n r t
  #
  # `"`, `\` and `/` stand for themselves; the letter escapes are replaced by
  # their control-character values.
  escaped_character =
    choice([
      ascii_char([?"]),
      ascii_char([?\\]),
      ascii_char([?/]),
      ascii_char([?b]) |> replace(?\b),
      ascii_char([?f]) |> replace(?\f),
      ascii_char([?n]) |> replace(?\n),
      ascii_char([?r]) |> replace(?\r),
      ascii_char([?t]) |> replace(?\t)
    ])

  # StringCharacter ::
  #   - SourceCharacter but not `"` or \ or LineTerminator
  #   - \u EscapedUnicode
  #   - \ EscapedCharacter
  #
  # The closing-quote / line-terminator exclusions are enforced by the
  # repeat_while predicate in string_value, not here.
  string_character =
    choice([
      ignore(string(~S(\u))) |> concat(escaped_unicode),
      ignore(ascii_char([?\\])) |> concat(escaped_character),
      any_unicode
    ])

  # BlockStringCharacter ::
  #   - SourceCharacter but not `"""` or `\"""`
  #   - `\"""`

  # Note: Block string values are interpreted to exclude blank initial and trailing
  # lines and uniform indentation with {BlockStringValue()}.
  block_string_character =
    choice([
      ignore(ascii_char([?\\])) |> concat(times(ascii_char([?"]), 3)),
      any_unicode
    ])

  # StringValue ::
  #   - `"` StringCharacter* `"`
  #   - `"""` BlockStringCharacter* `"""`
  #
  # mark_string_start stashes the opening quote's location in the parser
  # context so the emitted token points at the `"`, not the closing quote.
  string_value =
    ignore(ascii_char([?"]))
    |> post_traverse({:mark_string_start, []})
    |> repeat_while(string_character, {:not_end_of_quote, []})
    |> ignore(ascii_char([?"]))
    |> post_traverse({:string_value_token, []})

  block_string_value =
    ignore(string(~S(""")))
    |> post_traverse({:mark_block_string_start, []})
    |> repeat_while(block_string_character, {:not_end_of_block_quote, []})
    |> ignore(string(~S(""")))
    |> post_traverse({:block_string_value_token, []})

  # repeat_while predicate: stop a single-quoted string at an (unescaped)
  # closing `"`; otherwise defer to the line-terminator check, so an unclosed
  # string cannot run past the end of the line. Escaped quotes never reach
  # this clause because string_character consumes `\"` as a pair.
  defp not_end_of_quote(<<?", _::binary>>, context, _, _) do
    {:halt, context}
  end

  defp not_end_of_quote(rest, context, current_line, current_offset) do
    not_line_terminator(rest, context, current_line, current_offset)
  end

  # repeat_while predicate: stop a block string only at `"""`; block strings
  # may span multiple lines, so there is no line-terminator check here.
  defp not_end_of_block_quote(<<?", ?", ?", _::binary>>, context, _, _) do
    {:halt, context}
  end

  defp not_end_of_block_quote(_, context, _, _) do
    {:cont, context}
  end

  @doc """
  Tokenizes a GraphQL document.

  Returns `{:ok, tokens}` when the entire input is consumed, with each token's
  column converted from a byte offset to a character count. When tokenizing
  stops early, returns `{:error, rest, {line, column}}` where `rest` is the
  unconsumed input and the location points at the first unlexable character.
  """
  @spec tokenize(binary()) :: {:ok, [any()]} | {:error, binary(), {integer(), non_neg_integer()}}
  def tokenize(input) do
    # Pre-split the input into lines so byte columns can be converted to
    # character columns (see byte_loc_to_char_loc/2).
    lines = String.split(input, ~r/\r?\n/)

    case do_tokenize(input) do
      {:ok, tokens, "", _, _, _} ->
        tokens = Enum.map(tokens, &convert_token_column(&1, lines))
        {:ok, tokens}

      # Parser stopped before consuming everything: report the failure spot.
      {:ok, _, rest, _, {line, line_offset}, byte_offset} ->
        byte_column = byte_offset - line_offset + 1
        {:error, rest, byte_loc_to_char_loc({line, byte_column}, lines)}
    end
  end

  # Rewrites a token's location from byte column to character column.
  # Two clauses: 3-tuples carry a value payload, 2-tuples do not.
  defp convert_token_column({ident, loc, data}, lines) do
    {ident, byte_loc_to_char_loc(loc, lines), data}
  end

  defp convert_token_column({ident, loc}, lines) do
    {ident, byte_loc_to_char_loc(loc, lines)}
  end

  # Converts a {line, byte_column} location to {line, char_column} by counting
  # the graphemes in the byte prefix of that line — byte and character columns
  # differ whenever the line contains multi-byte UTF-8 characters.
  defp byte_loc_to_char_loc({line, byte_col}, lines) do
    current_line = Enum.at(lines, line - 1)
    byte_prefix = binary_part(current_line, 0, byte_col)
    char_col = String.length(byte_prefix)
    {line, char_col}
  end

  # The generated parser: repeatedly match one ignored run or one lexical
  # token until no branch applies.
  #
  # NOTE(review): `comment` is already an alternative inside `ignored`, which
  # is tried first, so the standalone `comment` branch below looks
  # unreachable — confirm whether comments were ever meant to be emitted as
  # tokens rather than ignored.
  @spec do_tokenize(binary()) ::
          {:ok, [any()], binary(), map(), {pos_integer(), pos_integer()}, pos_integer()}
  defparsec(
    :do_tokenize,
    repeat(
      choice([
        ignore(ignored),
        comment,
        punctuator,
        block_string_value,
        string_value,
        float_value,
        int_value,
        boolean_value_or_name_or_reserved_word
      ])
    )
  )

  # Injects a "0." mantissa into the (reversed) accumulator for floats written
  # without a fractional part, e.g. `1e3`.
  defp fill_mantissa(_rest, raw, context, _, _), do: {'0.' ++ raw, context}

  # Converts four accumulated hex-digit characters (reversed by NimbleParsec)
  # into the UTF-8 binary for that codepoint.
  defp unescape_unicode(_rest, content, context, _loc, _) do
    code = content |> Enum.reverse()
    value = :erlang.list_to_integer(code, 16)
    binary = :unicode.characters_to_binary([value])
    {[binary], context}
  end

  # Words emitted as :boolean_value tokens, stored as charlists to match the
  # parser accumulator representation.
  @boolean_words ~w(
    true
    false
  ) |> Enum.map(&String.to_charlist/1)

  # Words emitted as their own atom token (e.g. `query` -> {:query, loc}).
  @reserved_words ~w(
    directive
    enum
    extend
    fragment
    implements
    input
    interface
    mutation
    null
    on
    ON
    query
    repeatable
    scalar
    schema
    subscription
    type
    union
  ) |> Enum.map(&String.to_charlist/1)

  # post_traverse callback for the Name production: un-reverses the
  # accumulator, then dispatches on the word's class below.
  defp boolean_value_or_name_or_reserved_word(rest, chars, context, loc, byte_offset) do
    value = chars |> Enum.reverse()
    do_boolean_value_or_name_or_reserved_word(rest, value, context, loc, byte_offset)
  end

  defp do_boolean_value_or_name_or_reserved_word(_rest, value, context, loc, byte_offset)
       when value in @boolean_words do
    {[{:boolean_value, line_and_column(loc, byte_offset, length(value)), value}], context}
  end

  defp do_boolean_value_or_name_or_reserved_word(_rest, value, context, loc, byte_offset)
       when value in @reserved_words do
    # Safe List.to_atom: guard restricts input to the fixed @reserved_words set.
    token_name = value |> List.to_atom()
    {[{token_name, line_and_column(loc, byte_offset, length(value))}], context}
  end

  defp do_boolean_value_or_name_or_reserved_word(_rest, value, context, loc, byte_offset) do
    {[{:name, line_and_column(loc, byte_offset, length(value)), value}], context}
  end

  # post_traverse callback: wraps the accumulated characters in a
  # {token_name, location, value} tuple (token_name supplied at parser
  # definition time, e.g. :int_value or :float_value).
  defp labeled_token(_rest, chars, context, loc, byte_offset, token_name) do
    value = chars |> Enum.reverse()
    {[{token_name, line_and_column(loc, byte_offset, length(value)), value}], context}
  end

  # Records the location of the opening `"` (already consumed, hence the
  # correction of 1) in the context for string_value_token/5 to use.
  defp mark_string_start(_rest, chars, context, loc, byte_offset) do
    {[chars], Map.put(context, :token_location, line_and_column(loc, byte_offset, 1))}
  end

  # Records the location of the opening `"""` (3 consumed bytes) for
  # block_string_value_token/5.
  defp mark_block_string_start(_rest, _chars, context, loc, byte_offset) do
    {[], Map.put(context, :token_location, line_and_column(loc, byte_offset, 3))}
  end

  # Re-wraps the block string body in literal `"""` delimiters (the parser
  # ignored the originals) and emits the token at the saved start location.
  defp block_string_value_token(_rest, chars, context, _loc, _byte_offset) do
    value = '"""' ++ (chars |> Enum.reverse()) ++ '"""'
    {[{:block_string_value, context.token_location, value}], Map.delete(context, :token_location)}
  end

  # Same for single-quoted strings. `tl/1` drops the list that
  # mark_string_start/5 pushed onto the accumulator before the body chars.
  defp string_value_token(_rest, chars, context, _loc, _byte_offset) do
    value = '"' ++ tl(chars |> Enum.reverse()) ++ '"'
    {[{:string_value, context.token_location, value}], Map.delete(context, :token_location)}
  end

  # post_traverse callback for punctuators: the matched characters become the
  # token atom itself, e.g. `...` -> :"...". Input is limited to the fixed
  # punctuator set, so List.to_atom/1 is safe here.
  defp atom_token(_rest, chars, context, loc, byte_offset) do
    value = chars |> Enum.reverse()
    token_atom = value |> List.to_atom()
    {[{token_atom, line_and_column(loc, byte_offset, length(value))}], context}
  end

  @doc """
  Computes a token's `{line, column}` (column in bytes, 1-based) from
  NimbleParsec's post-match position.

  `byte_offset` points just past the matched token, so `column_correction`
  (the token's length in the accumulator) walks the column back to the token's
  first character.
  """
  def line_and_column({line, line_offset}, byte_offset, column_correction) do
    column = byte_offset - line_offset - column_correction + 1
    {line, column}
  end

  # repeat_while predicate shared by comment and single-quoted-string
  # scanning: halt at LF or CR, continue otherwise.
  defp not_line_terminator(<<?\n, _::binary>>, context, _, _), do: {:halt, context}
  defp not_line_terminator(<<?\r, _::binary>>, context, _, _), do: {:halt, context}
  defp not_line_terminator(_, context, _, _), do: {:cont, context}
end