line_scanner.ex (9567B)
defmodule EarmarkParser.LineScanner do
  @moduledoc false

  alias EarmarkParser.{Helpers, Line, Options}

  # This is the re that matches the ridiculous "[id]: url title" syntax

  # Title part of a link definition. The branch-reset group `(?|` makes every
  # alternative capture into the same group number.
  @id_title_part ~S"""
        (?|
             " (.*)  "         # in quotes
          |  ' (.*)  '         #
          | \( (.*) \)         # in parens
        )
  """

  @id_re ~r'''
     ^\[([^^].*?)\]:            # [someid]:
     \s+
     (?|
         < (\S+) >          # url in <>s
       |   (\S+)            # or without
     )
     (?:
        \s+                   # optional title
        #{@id_title_part}
     )?
     \s*
     $
  '''x

  @indent_re ~r'''
    \A ( (?: \s{4})+ ) (\s*)                       # 4 or more leading spaces
    (.*)                                  # the rest
  '''x

  @void_tags ~w{area br hr img wbr}
  @void_tag_rgx ~r'''
      ^<( #{Enum.join(@void_tags, "|")} )
        .*?
        >
  '''x

  # Matches a table column made only of whitespace, `|`, `:` and `-`.
  @column_rgx ~r{\A[\s|:-]+\z}

  # Returns true when `<tag>` matches the void tag regex built from @void_tags.
  @doc false
  def void_tag?(tag), do: Regex.match?(@void_tag_rgx, "<#{tag}>")

  # Classifies every line of `lines`. Lines are numbered starting at
  # `options.line - 1`; the numbering ends up in the `lnb` field of each result.
  def scan_lines(lines, options, recursive) do
    lines
    |> _lines_with_count(options.line - 1)
    |> _with_lookahead(options, recursive)
  end

  # Convenience clause: classifies a `{line, lnb}` pair with default options.
  def type_of(line, recursive)
      when is_boolean(recursive),
      do: type_of(line, %Options{}, recursive)

  # Classifies one `{line, lnb}` pair. The line is first passed through
  # `Helpers.expand_tabs/1` and `Helpers.remove_line_ending/2`; the annotation
  # returned by the latter and the line number are stored on the result.
  def type_of({line, lnb}, options = %Options{annotations: annotations}, recursive) do
    {line1, annotation} = line |> Helpers.expand_tabs() |> Helpers.remove_line_ending(annotations)
    %{_type_of(line1, options, recursive) | annotation: annotation, lnb: lnb}
  end

  # Core classifier. The `cond` clauses are tried top to bottom and the first
  # truthy one wins, so their order is part of the behaviour. Clauses guarded by
  # `lt_four?` only apply to lines with fewer than four leading blanks; clauses
  # guarded by `!recursive` (the HTML ones) are skipped on recursive scans.
  defp _type_of(line, options = %Options{}, recursive) do
    {ial, stripped_line} = Helpers.extract_ial(line)
    {content, indent} = _count_indent(line, 0)
    lt_four? = indent < 4

    cond do
      content == "" ->
        _create_text(line, content, indent)

      lt_four? && !recursive && Regex.run(~r/\A <! (?: -- .*? -- \s* )+ > \z/x, content) ->
        %Line.HtmlComment{complete: true, indent: indent, line: line}

      lt_four? && !recursive && Regex.run(~r/\A <!-- .*? \z/x, content) ->
        %Line.HtmlComment{complete: false, indent: indent, line: line}

      lt_four? && Regex.run(~r/^ (?:-\s?){3,} $/x, content) ->
        %Line.Ruler{type: "-", indent: indent, line: line}

      lt_four? && Regex.run(~r/^ (?:\*\s?){3,} $/x, content) ->
        %Line.Ruler{type: "*", indent: indent, line: line}

      lt_four? && Regex.run(~r/\A (?:_\s?){3,} \z/x, content) ->
        %Line.Ruler{type: "_", indent: indent, line: line}

      # One to six `#` followed by whitespace. Trailing `#`s are dropped only
      # when the heading text itself contains no `#` (first alternative).
      match = Regex.run(~R/^(#{1,6})\s+(?|([^#]+)#*\s*$|(.*))/u, stripped_line) ->
        [_, level, heading] = match

        %Line.Heading{
          level: String.length(level),
          content: String.trim(heading),
          indent: 0,
          ial: ial,
          line: line
        }

      match = lt_four? && Regex.run(~r/\A>\s?(.*)/, content) ->
        [_, quoted] = match
        %Line.BlockQuote{content: quoted, indent: indent, ial: ial, line: line}

      match = Regex.run(@indent_re, line) ->
        [_, spaces, more_spaces, rest] = match
        sl = byte_size(spaces)

        %Line.Indent{
          level: div(sl, 4),
          content: more_spaces <> rest,
          indent: byte_size(more_spaces) + sl,
          line: line
        }

      match = Regex.run(~r/\A(\s*)(`{3,}|~{3,})\s*([^`\s]*)\s*\z/u, line) ->
        [_, leading, fence, language] = match

        %Line.Fence{
          delimiter: fence,
          language: _attribute_escape(language),
          indent: byte_size(leading),
          line: line
        }

      # Although no block tags I still think they should close a preceding para as do many other
      # implementations.
      match = !recursive && Regex.run(@void_tag_rgx, line) ->
        [_, tag] = match
        %Line.HtmlOneLine{tag: tag, content: line, indent: 0, line: line}

      # Opening and matching closing tag on the same line.
      match = !recursive && Regex.run(~r{\A<([-\w]+?)(?:\s.*)?>.*</\1>}, line) ->
        [_, tag] = match
        %Line.HtmlOneLine{tag: tag, content: line, indent: 0, line: line}

      # Self-closing tag (`<tag ... />`).
      match = !recursive && Regex.run(~r{\A<([-\w]+?)(?:\s.*)?/>.*}, line) ->
        [_, tag] = match
        %Line.HtmlOneLine{tag: tag, content: line, indent: 0, line: line}

      match = !recursive && Regex.run(~r/\A < ([-\w]+?) (?:\s.*)? >/x, line) ->
        [_, tag] = match
        %Line.HtmlOpenTag{tag: tag, content: line, indent: 0, line: line}

      match = lt_four? && !recursive && Regex.run(~r/\A<\/([-\w]+?)>/, content) ->
        [_, tag] = match
        %Line.HtmlCloseTag{tag: tag, indent: indent, line: line}

      match = lt_four? && Regex.run(@id_re, content) ->
        [_, id, url | maybe_title] = match

        # The title group is optional, so the match may or may not carry it.
        title =
          case maybe_title do
            [] -> ""
            [first | _] -> first
          end

        %Line.IdDef{id: id, url: url, title: title, indent: indent, line: line}

      match = options.footnotes && Regex.run(~r/\A\[\^([^\s\]]+)\]:\s+(.*)/, line) ->
        [_, id, first_line] = match
        %Line.FnDef{id: id, content: first_line, indent: 0, line: line}

      match = lt_four? && Regex.run(~r/^([-*+])\s(\s*)(.*)/, content) ->
        [_, bullet, spaces, text] = match

        %Line.ListItem{
          type: :ul,
          bullet: bullet,
          content: spaces <> text,
          indent: indent,
          list_indent: String.length(bullet <> spaces) + indent + 1,
          line: line
        }

      match = lt_four? && Regex.run(~r/^(\d{1,9}[.)])\s(\s*)(.*)/, content) ->
        _create_list_item(match, indent, line)

      # Table row delimited by leading and trailing pipes.
      match = Regex.run(~r/^ \| (?: [^|]+ \|)+ \s* $ /x, content) ->
        [body] = match

        columns =
          body
          |> String.trim()
          |> String.trim("|")
          |> _split_table_columns()

        %Line.TableLine{
          content: line,
          columns: columns,
          is_header: _determine_if_header(columns),
          indent: indent,
          line: line
        }

      # Table row without outer pipes: needs a pipe surrounded by whitespace.
      line |> _strip_double_brackets() |> String.match?(~r/\A (\s*) .* \s \| \s /x) ->
        columns = _split_table_columns(line)

        %Line.TableLine{
          content: line,
          columns: columns,
          is_header: _determine_if_header(columns),
          indent: indent,
          line: line
        }

      # With `gfm_tables` any line containing a pipe qualifies, but the
      # resulting row is flagged as needing a header.
      options.gfm_tables &&
          (line |> _strip_double_brackets() |> String.match?(~r/\A (\s*) .* \| /x)) ->
        columns = _split_table_columns(line)

        %Line.TableLine{
          content: line,
          columns: columns,
          is_header: _determine_if_header(columns),
          needs_header: true,
          indent: indent,
          line: line
        }

      match = Regex.run(~r/^(=|-)+\s*$/, line) ->
        [_, type] = match
        level = if(String.starts_with?(type, "="), do: 1, else: 2)
        %Line.SetextUnderlineHeading{level: level, indent: 0, line: line}

      match = lt_four? && Regex.run(~r<^{:(\s*[^}]+)}\s*$>, content) ->
        [_, ial] = match
        %Line.Ial{attrs: String.trim(ial), verbatim: ial, indent: indent, line: line}

      true ->
        _create_text(line, content, indent)
    end
  end

  # Escapes the fence language string: every ampersand becomes the `amp`
  # character entity and every less-than sign becomes the `lt` entity.
  # The ampersand must be replaced first, otherwise the ampersand introduced by
  # the `lt` entity would itself be escaped again.
  defp _attribute_escape(string),
    do:
      string
      |> String.replace("&", "&amp;")
      |> String.replace("<", "&lt;")

  # Builds the ordered list item from the captures `[_, bullet, spaces, text]`.
  # `list_indent` is the line indent plus the bullet width plus one blank and
  # the extra blanks after it; more than three extra blanks count as none.
  defp _create_list_item([_, bullet, spaces, text], indent, line) do
    sl = byte_size(spaces)
    sl1 = if sl > 3, do: 1, else: sl + 1
    sl2 = sl1 + byte_size(bullet)

    %Line.ListItem{
      type: :ol,
      bullet: bullet,
      content: spaces <> text,
      indent: indent,
      list_indent: indent + sl2,
      line: line
    }
  end

  # Builds a Blank (no content after the leading blanks) or a Text line.
  defp _create_text(line) do
    {content, indent} = _count_indent(line, 0)
    _create_text(line, content, indent)
  end

  defp _create_text(line, "", indent),
    do: %Line.Blank{indent: indent, line: line}

  defp _create_text(line, content, indent),
    do: %Line.Text{content: content, indent: indent, line: line}

  # Strips leading spaces and tabs, counting each of them as one column, and
  # returns `{rest, count}`.
  defp _count_indent(<<space, rest::binary>>, indent) when space in [?\s, ?\t],
    do: _count_indent(rest, indent + 1)

  defp _count_indent(rest, indent),
    do: {rest, indent}

  # Pairs each line with consecutive numbers starting at `offset`.
  defp _lines_with_count(lines, offset), do: Enum.with_index(lines, offset)

  # Classifies lines one by one. After an opening fence at indent 0, or after
  # an incomplete HTML comment, the following lines are not classified: they
  # are consumed verbatim by `_lookahead_until_match/4` until the closing
  # pattern is seen.
  defp _with_lookahead([line_lnb | lines], options, recursive) do
    case type_of(line_lnb, options, recursive) do
      %Line.Fence{delimiter: delimiter, indent: 0} = fence ->
        stop = ~r/\A (\s*) (?: #{delimiter} ) \s* ([^`\s]*) \s* \z/xu
        [fence | _lookahead_until_match(lines, stop, options, recursive)]

      %Line.HtmlComment{complete: false} = html_comment ->
        [html_comment | _lookahead_until_match(lines, ~r/-->/u, options, recursive)]

      other ->
        [other | _with_lookahead(lines, options, recursive)]
    end
  end

  defp _with_lookahead([], _options, _recursive), do: []

  # Emits raw Text/Blank lines until a line matches `regex`. The matching line
  # is classified normally and regular scanning resumes after it. Running out
  # of lines before a match simply ends the result.
  defp _lookahead_until_match([], _, _, _), do: []

  defp _lookahead_until_match([{line, lnb} | lines], regex, options, recursive) do
    if line =~ regex do
      [type_of({line, lnb}, options, recursive) | _with_lookahead(lines, options, recursive)]
    else
      [
        %{_create_text(line) | lnb: lnb}
        | _lookahead_until_match(lines, regex, options, recursive)
      ]
    end
  end

  # Removes `[[ ... ]]` spans so that pipes inside them are ignored by the
  # table detection (presumably wiki-style links; confirm against callers).
  defp _strip_double_brackets(line), do: String.replace(line, ~r/\[\[ .*? \]\]/x, "")

  # A row is a header separator when every column matches @column_rgx.
  defp _determine_if_header(columns) do
    Enum.all?(columns, &Regex.match?(@column_rgx, &1))
  end

  # Splits on pipes that are not preceded by a backslash, trims each column and
  # turns the escaped pipes (`\|`) back into plain pipes.
  defp _split_table_columns(line) do
    line
    |> String.split(~r{(?<!\\)\|})
    |> Enum.map(&String.trim/1)
    |> Enum.map(fn col -> Regex.replace(~r{\\\|}, col, "|") end)
  end
end

# SPDX-License-Identifier: Apache-2.0