earmark_parser.ex (21138B)
1 defmodule EarmarkParser do 2 @type ast_meta :: map() 3 @type ast_tag :: binary() 4 @type ast_attribute_name :: binary() 5 @type ast_attribute_value :: binary() 6 @type ast_attribute :: {ast_attribute_name(), ast_attribute_value()} 7 @type ast_attributes :: list(ast_attribute()) 8 @type ast_tuple :: {ast_tag(), ast_attributes(), ast(), ast_meta()} 9 @type ast_node :: binary() | ast_tuple() 10 @type ast :: list(ast_node()) 11 12 @moduledoc ~S""" 13 14 ### API 15 16 #### EarmarkParser.as_ast 17 18 This is the structure of the result of `as_ast`. 19 20 {:ok, ast, []} = EarmarkParser.as_ast(markdown) 21 {:ok, ast, deprecation_messages} = EarmarkParser.as_ast(markdown) 22 {:error, ast, error_messages} = EarmarkParser.as_ast(markdown) 23 24 For examples see the functiondoc below. 25 26 #### Options 27 28 Options can be passed into `as_ast/2` according to the documentation of `EarmarkParser.Options`. 29 30 {status, ast, errors} = EarmarkParser.as_ast(markdown, options) 31 32 ## Supports 33 34 Standard [Gruber markdown][gruber]. 35 36 [gruber]: <http://daringfireball.net/projects/markdown/syntax> 37 38 ## Extensions 39 40 ### Links 41 42 #### Links supported by default 43 44 ##### Oneline HTML Link tags 45 46 iex(1)> EarmarkParser.as_ast(~s{<a href="href">link</a>}) 47 {:ok, [{"a", [{"href", "href"}], ["link"], %{verbatim: true}}], []} 48 49 ##### Markdown links 50 51 New style ... 
52 53 iex(2)> EarmarkParser.as_ast(~s{[title](destination)}) 54 {:ok, [{"p", [], [{"a", [{"href", "destination"}], ["title"], %{}}], %{}}], []} 55 56 and old style 57 58 iex(3)> EarmarkParser.as_ast("[foo]: /url \"title\"\n\n[foo]\n") 59 {:ok, [{"p", [], [{"a", [{"href", "/url"}, {"title", "title"}], ["foo"], %{}}], %{}}], []} 60 61 #### Autolinks 62 63 iex(4)> EarmarkParser.as_ast("<https://elixir-lang.com>") 64 {:ok, [{"p", [], [{"a", [{"href", "https://elixir-lang.com"}], ["https://elixir-lang.com"], %{}}], %{}}], []} 65 66 #### Additional link parsing via options 67 68 69 #### Pure links 70 71 **N.B.** that the `pure_links` option is `true` by default 72 73 iex(5)> EarmarkParser.as_ast("https://github.com") 74 {:ok, [{"p", [], [{"a", [{"href", "https://github.com"}], ["https://github.com"], %{}}], %{}}], []} 75 76 But can be deactivated 77 78 iex(6)> EarmarkParser.as_ast("https://github.com", pure_links: false) 79 {:ok, [{"p", [], ["https://github.com"], %{}}], []} 80 81 82 #### Wikilinks... 
83 84 are disabled by default 85 86 iex(7)> EarmarkParser.as_ast("[[page]]") 87 {:ok, [{"p", [], ["[[page]]"], %{}}], []} 88 89 and can be enabled 90 91 iex(8)> EarmarkParser.as_ast("[[page]]", wikilinks: true) 92 {:ok, [{"p", [], [{"a", [{"href", "page"}], ["page"], %{wikilink: true}}], %{}}], []} 93 94 95 ### Sub and Sup HTML Elements 96 97 This feature is not enabled by default but can be enabled with the option `sub_sup: true` 98 99 Therefore we will get 100 101 iex(9)> EarmarkParser.as_ast("H~2~O or a^n^ + b^n^ = c^n^") 102 {:ok, [{"p", [], ["H~2~O or a^n^ + b^n^ = c^n^"], %{}}], []} 103 104 But by specifying `sub_sup: true` 105 106 iex(10)> EarmarkParser.as_ast("H~2~O or a^n^ + b^n^ = c^n^", sub_sup: true) 107 {:ok, [{"p", [], ["H", {"sub", [], ["2"], %{}}, "O or a", {"sup", [], ["n"], %{}}, " + b", {"sup", [], ["n"], %{}}, " = c", {"sup", [], ["n"], %{}}], %{}}], []} 108 109 ### Github Flavored Markdown 110 111 GFM is supported by default, however as GFM is a moving target and all GFM extension do not make sense in a general context, EarmarkParser does not support all of it, here is a list of what is supported: 112 113 #### Strike Through 114 115 iex(11)> EarmarkParser.as_ast("~~hello~~") 116 {:ok, [{"p", [], [{"del", [], ["hello"], %{}}], %{}}], []} 117 118 #### GFM Tables 119 120 Are not enabled by default 121 122 iex(12)> as_ast("a|b\\n-|-\\nc|d\\n") 123 {:ok, [{"p", [], ["a|b\\n-|-\\nc|d\\n"], %{}}], []} 124 125 But can be enabled with `gfm_tables: true` 126 127 iex(13)> as_ast("a|b\n-|-\nc|d\n", gfm_tables: true) 128 {:ok, 129 [ 130 { 131 "table", 132 [], 133 [ 134 {"thead", [], [{"tr", [], [{"th", [{"style", "text-align: left;"}], ["a"], %{}}, {"th", [{"style", "text-align: left;"}], ["b"], %{}}], %{}}], %{}}, 135 {"tbody", [], [{"tr", [], [{"td", [{"style", "text-align: left;"}], ["c"], %{}}, {"td", [{"style", "text-align: left;"}], ["d"], %{}}], %{}}], %{}} 136 ], 137 %{} 138 } 139 ], 140 []} 141 142 #### Syntax Highlighting 143 144 All backquoted or 
fenced code blocks with a language string are rendered with the given 145 language as a _class_ attribute of the _code_ tag. 146 147 For example: 148 149 iex(14)> [ 150 ...(14)> "```elixir", 151 ...(14)> " @tag :hello", 152 ...(14)> "```" 153 ...(14)> ] |> as_ast() 154 {:ok, [{"pre", [], [{"code", [{"class", "elixir"}], [" @tag :hello"], %{}}], %{}}], []} 155 156 will be rendered as shown in the doctest above. 157 158 If you want to integrate with a syntax highlighter with different conventions you can add more classes by specifying prefixes that will be 159 put before the language string. 160 161 Prism.js for example needs a class `language-elixir`. In order to achieve that goal you can add `language-` 162 as a `code_class_prefix` to `EarmarkParser.Options`. 163 164 In the following example we want more than one additional class, so we add more prefixes. 165 166 iex(15)> [ 167 ...(15)> "```elixir", 168 ...(15)> " @tag :hello", 169 ...(15)> "```" 170 ...(15)> ] |> as_ast(%EarmarkParser.Options{code_class_prefix: "lang- language-"}) 171 {:ok, [{"pre", [], [{"code", [{"class", "elixir lang-elixir language-elixir"}], [" @tag :hello"], %{}}], %{}}], []} 172 173 174 #### Footnotes 175 176 **N.B.** Footnotes are disabled by default, use `as_ast(..., footnotes: true)` to enable them 177 178 Footnotes are now a **superset** of GFM Footnotes. 
This implies some changes 179 180 - Footnote definitions (`[^footnote_id]`) must come at the end of your document (_GFM_) 181 - Footnotes that are not referenced are not rendered anymore (_GFM_) 182 - Footnote definitions can contain any markup with the exception of footnote definitions 183 184 iex(16)> markdown = [ 185 ...(16)> "My reference[^to_footnote]", 186 ...(16)> "", 187 ...(16)> "[^1]: I am not rendered", 188 ...(16)> "[^to_footnote]: Important information"] 189 ...(16)> {:ok, ast, []} = as_ast(markdown, footnotes: true) 190 ...(16)> ast 191 [ 192 {"p", [], ["My reference", 193 {"a", 194 [{"href", "#fn:to_footnote"}, {"id", "fnref:to_footnote"}, {"class", "footnote"}, {"title", "see footnote"}], 195 ["to_footnote"], %{}} 196 ], %{}}, 197 {"div", 198 [{"class", "footnotes"}], 199 [{"hr", [], [], %{}}, 200 {"ol", [], 201 [{"li", [{"id", "fn:to_footnote"}], 202 [{"a", [{"class", "reversefootnote"}, {"href", "#fnref:to_footnote"}, {"title", "return to article"}], ["↩"], %{}}, 203 {"p", [], ["Important information"], %{}}], %{}} 204 ], %{}}], %{}} 205 ] 206 207 For more complex examples of footnotes, please refer to 208 [these tests](https://github.com/RobertDober/earmark_parser/tree/master/test/acceptance/ast/footnotes/multiple_fn_test.exs) 209 210 #### Breaks 211 212 Hard linebreaks are disabled by default 213 214 iex(17)> ["* a"," b", "c"] 215 ...(17)> |> as_ast() 216 {:ok, 217 [{"ul", [], [{"li", [], ["a\nb\nc"], %{}}], %{}}], 218 []} 219 220 But can be enabled with `breaks: true` 221 222 iex(18)> ["* a"," b", "c"] 223 ...(18)> |> as_ast(breaks: true) 224 {:ok, [{"ul", [], [{"li", [], ["a", {"br", [], [], %{}}, "b", {"br", [], [], %{}}, "c"], %{}}], %{}}], []} 225 226 #### Enabling **all** options that are disabled by default 227 228 Can be achieved with the `all: true` option 229 230 iex(19)> [ 231 ...(19)> "a^n^", 232 ...(19)> "b~2~", 233 ...(19)> "[[wikilink]]"] 234 ...(19)> |> as_ast(all: true) 235 {:ok, [ 236 {"p", [], ["a", {"sup", [], ["n"], %{}}, 
{"br", [], [], %{}}, "b", {"sub", [], ["2"], %{}}, {"br", [], [], %{}}, {"a", [{"href", "wikilink"}], ["wikilink"], %{wikilink: true}}], %{}} 237 ], 238 []} 239 240 #### Tables 241 242 Are supported as long as they are preceded by an empty line. 243 244 State | Abbrev | Capital 245 ----: | :----: | ------- 246 Texas | TX | Austin 247 Maine | ME | Augusta 248 249 Tables may have leading and trailing vertical bars on each line 250 251 | State | Abbrev | Capital | 252 | ----: | :----: | ------- | 253 | Texas | TX | Austin | 254 | Maine | ME | Augusta | 255 256 Tables need not have headers, in which case all column alignments 257 default to left. 258 259 | Texas | TX | Austin | 260 | Maine | ME | Augusta | 261 262 Currently we assume there are always spaces around interior vertical unless 263 there are exterior bars. 264 265 However in order to be more GFM compatible the `gfm_tables: true` option 266 can be used to interpret only interior vertical bars as a table if a separation 267 line is given, therefore 268 269 Language|Rating 270 --------|------ 271 Elixir | awesome 272 273 is a table (if and only if `gfm_tables: true`) while 274 275 Language|Rating 276 Elixir | awesome 277 278 never is. 279 280 #### HTML Blocks 281 282 HTML is not parsed recursively or detected in all conditions right now, though GFM compliance 283 is a goal. 284 285 But for now the following holds: 286 287 A HTML Block defined by a tag starting a line and the same tag starting a different line is parsed 288 as one HTML AST node, marked with %{verbatim: true} 289 290 E.g. 
291 292 iex(20)> lines = [ "<div><span>", "some</span><text>", "</div>more text" ] 293 ...(20)> EarmarkParser.as_ast(lines) 294 {:ok, [{"div", [], ["<span>", "some</span><text>"], %{verbatim: true}}, "more text"], []} 295 296 And a line starting with an opening tag and ending with the corresponding closing tag is parsed in similar 297 fashion 298 299 iex(21)> EarmarkParser.as_ast(["<span class=\"superspan\">spaniel</span>"]) 300 {:ok, [{"span", [{"class", "superspan"}], ["spaniel"], %{verbatim: true}}], []} 301 302 What is HTML? 303 304 We differ from strict GFM by allowing **all** tags, not only HTML5 tags; this holds for one liners.... 305 306 iex(22)> {:ok, ast, []} = EarmarkParser.as_ast(["<stupid />", "<not>better</not>"]) 307 ...(22)> ast 308 [ 309 {"stupid", [], [], %{verbatim: true}}, 310 {"not", [], ["better"], %{verbatim: true}}] 311 312 and for multi line blocks 313 314 iex(23)> {:ok, ast, []} = EarmarkParser.as_ast([ "<hello>", "world", "</hello>"]) 315 ...(23)> ast 316 [{"hello", [], ["world"], %{verbatim: true}}] 317 318 #### HTML Comments 319 320 Are recognized if they start a line (after ws) and are parsed until the next `-->` is found; 321 all text after the next '-->' is ignored 322 323 E.g. 
324 325 iex(24)> EarmarkParser.as_ast(" <!-- Comment\ncomment line\ncomment --> text -->\nafter") 326 {:ok, [{:comment, [], [" Comment", "comment line", "comment "], %{comment: true}}, {"p", [], ["after"], %{}}], []} 327 328 329 #### Lists 330 331 Lists are pretty much GFM compliant, but some behaviors concerning the interpretation of the markdown inside a List Item's first 332 paragraph seem not worth to be interpreted, examples are blockquote in a tight [list item](https://babelmark.github.io/?text=*+aa%0A++%3E+Second) 333 which we can only have in a [loose one](https://babelmark.github.io/?text=*+aa%0A++%0A++%3E+Second) 334 335 Or a headline in a [tight list item](https://babelmark.github.io/?text=*+bb%0A++%23+Headline) which, again is only available in the 336 [loose version](https://babelmark.github.io/?text=*+bb%0A%0A++%23+Headline) in EarmarkParser. 337 338 Furthermore [this example](https://babelmark.github.io/?text=*+aa%0A++%60%60%60%0ASecond%0A++%60%60%60) demonstrates how weird 339 and definitely not useful GFM's own interpretation can get. 340 341 Therefore we stick to a more predictable approach. 342 343 iex(25)> markdown = [ 344 ...(25)> "* aa", 345 ...(25)> " ```", 346 ...(25)> "Second", 347 ...(25)> " ```" ] 348 ...(25)> as_ast(markdown) 349 {:ok, [{"ul", [], [{"li", [], ["aa", {"pre", [], [{"code", [], ["Second"], %{}}], %{}}], %{}}], %{}}], []} 350 351 Also we do support the immediate style of block content inside lists 352 353 iex(26)> as_ast("* > Nota Bene!") 354 {:ok, [{"ul", [], [{"li", [], [{"blockquote", [], [{"p", [], ["Nota Bene!"], %{}}], %{}}], %{}}], %{}}], []} 355 356 or 357 358 iex(27)> as_ast("1. # Breaking...") 359 {:ok, [{"ol", [], [{"li", [], [{"h1", [], ["Breaking..."], %{}}], %{}}], %{}}], []} 360 361 362 ### Adding Attributes with the IAL extension 363 364 #### To block elements 365 366 HTML attributes can be added to any block-level element. We use 367 the Kramdown syntax: add the line `{:` _attrs_ `}` following the block. 
368 369 iex(28)> markdown = ["# Headline", "{:.from-next-line}"] 370 ...(28)> as_ast(markdown) 371 {:ok, [{"h1", [{"class", "from-next-line"}], ["Headline"], %{}}], []} 372 373 Headers can also have the IAL string at the end of the line 374 375 iex(29)> markdown = ["# Headline{:.from-same-line}"] 376 ...(29)> as_ast(markdown) 377 {:ok, [{"h1", [{"class", "from-same-line"}], ["Headline"], %{}}], []} 378 379 A special use case is headers inside blockquotes which allow for some nifty styling in `ex_doc`* 380 see [this PR](https://github.com/elixir-lang/ex_doc/pull/1400) if you are interested in the technical 381 details 382 383 iex(30)> markdown = ["> # Headline{:.warning}"] 384 ...(30)> as_ast(markdown) 385 {:ok, [{"blockquote", [], [{"h1", [{"class", "warning"}], ["Headline"], %{}}], %{}}], []} 386 387 This also works for headers inside lists 388 389 iex(31)> markdown = ["- # Headline{:.warning}"] 390 ...(31)> as_ast(markdown) 391 {:ok, [{"ul", [], [{"li", [], [{"h1", [{"class", "warning"}], ["Headline"], %{}}], %{}}], %{}}], []} 392 393 It still works for inline code, as it did before 394 395 iex(32)> markdown = "`Enum.map`{:lang=elixir}" 396 ...(32)> as_ast(markdown) 397 {:ok, [{"p", [], [{"code", [{"class", "inline"}, {"lang", "elixir"}], ["Enum.map"], %{}}], %{}}], []} 398 399 400 _attrs_ can be one or more of: 401 402 * `.className` 403 * `#id` 404 * name=value, name="value", or name='value' 405 406 For example: 407 408 # Warning 409 {: .red} 410 411 Do not turn off the engine 412 if you are at altitude. 413 {: .boxed #warning spellcheck="true"} 414 415 #### To links or images 416 417 It is possible to add IAL attributes to generated links or images in the following 418 format. 419 420 iex(33)> markdown = "[link](url) {: .classy}" 421 ...(33)> EarmarkParser.as_ast(markdown) 422 { :ok, [{"p", [], [{"a", [{"class", "classy"}, {"href", "url"}], ["link"], %{}}], %{}}], []} 423 424 For both cases, malformed attributes are ignored and warnings are issued. 
425 426 iex(34)> [ "Some text", "{:hello}" ] |> Enum.join("\n") |> EarmarkParser.as_ast() 427 {:error, [{"p", [], ["Some text"], %{}}], [{:warning, 2,"Illegal attributes [\"hello\"] ignored in IAL"}]} 428 429 It is possible to escape the IAL in both forms if necessary 430 431 iex(35)> markdown = "[link](url)\\{: .classy}" 432 ...(35)> EarmarkParser.as_ast(markdown) 433 {:ok, [{"p", [], [{"a", [{"href", "url"}], ["link"], %{}}, "{: .classy}"], %{}}], []} 434 435 This of course is not necessary in code blocks or text lines 436 containing an IAL-like string, as in the following example 437 438 iex(36)> markdown = "hello {:world}" 439 ...(36)> EarmarkParser.as_ast(markdown) 440 {:ok, [{"p", [], ["hello {:world}"], %{}}], []} 441 442 ## Limitations 443 444 * Block-level HTML is correctly handled only if each HTML 445 tag appears on its own line. So 446 447 <div> 448 <div> 449 hello 450 </div> 451 </div> 452 453 will work. However, the following won't 454 455 <div> 456 hello</div> 457 458 * John Gruber's tests contain an ambiguity when it comes to 459 lines that might be the start of a list inside paragraphs. 460 461 One test says that 462 463 This is the text 464 * of a paragraph 465 that I wrote 466 467 is a single paragraph. The "*" is not significant. However, another 468 test has 469 470 * A list item 471 * an another 472 473 and expects this to be a nested list. But, in reality, the second could just 474 be the continuation of a paragraph. 475 476 I've chosen always to use the second interpretation—a line that looks like 477 a list item will always be a list item. 478 479 * Rendering of block and inline elements. 480 481 Block or void HTML elements that are at the absolute beginning of a line end 482 the preceding paragraph. 
483 484 Thusly 485 486 mypara 487 <hr /> 488 489 Becomes 490 491 <p>mypara</p> 492 <hr /> 493 494 While 495 496 mypara 497 <hr /> 498 499 will be transformed into 500 501 <p>mypara 502 <hr /></p> 503 504 ## Annotations 505 506 **N.B.** this is an experimental feature from v1.4.16-pre on and might change or be removed again 507 508 The idea is that each markdown line can be annotated, as such annotations change the semantics of Markdown 509 they have to be enabled with the `annotations` option. 510 511 If the `annotations` option is set to a string (only one string is supported right now, but a list might 512 be implemented later on, hence the name), the last occurrence of that string in a line and all text following 513 it will be added to the line as an annotation. 514 515 Depending on how that line will eventually be parsed, this annotation will be added to the meta map (the 4th element 516 in an AST quadruple) with the key `:annotation` 517 518 In the current version the annotation will only be applied to verbatim HTML tags and paragraphs 519 520 Let us show some examples now: 521 522 ### Annotated Paragraphs 523 524 iex(37)> as_ast("hello %> annotated", annotations: "%>") 525 {:ok, [{"p", [], ["hello "], %{annotation: "%> annotated"}}], []} 526 527 If we annotate more than one line in a para the first annotation takes precedence 528 529 iex(38)> as_ast("hello %> annotated\nworld %> discarded", annotations: "%>") 530 {:ok, [{"p", [], ["hello \nworld "], %{annotation: "%> annotated"}}], []} 531 532 ### Annotated HTML elements 533 534 In one line 535 536 iex(39)> as_ast("<span>One Line</span> // a span", annotations: "//") 537 {:ok, [{"span", [], ["One Line"], %{annotation: "// a span", verbatim: true}}], []} 538 539 or block elements 540 541 iex(40)> [ 542 ...(40)> "<div> : annotation", 543 ...(40)> " <span>text</span>", 544 ...(40)> "</div> : discarded" 545 ...(40)> ] |> as_ast(annotations: " : ") 546 {:ok, [{"div", [], [" <span>text</span>"], %{annotation: " : 
annotation", verbatim: true}}], []} 547 548 ### Commenting your Markdown 549 550 Although many markdown elements do not support annotations yet, they can be used to comment your markdown, w/o cluttering 551 the generated AST with comments 552 553 iex(41)> [ 554 ...(41)> "# Headline --> first line", 555 ...(41)> "- item1 --> a list item", 556 ...(41)> "- item2 --> another list item", 557 ...(41)> "", 558 ...(41)> "<http://somewhere/to/go> --> do not go there" 559 ...(41)> ] |> as_ast(annotations: "-->") 560 {:ok, [ 561 {"h1", [], ["Headline"], %{}}, 562 {"ul", [], [{"li", [], ["item1 "], %{}}, {"li", [], ["item2 "], %{}}], %{}}, 563 {"p", [], [{"a", [{"href", "http://somewhere/to/go"}], ["http://somewhere/to/go"], %{}}, " "], %{annotation: "--> do not go there"}} 564 ], [] 565 } 566 567 """ 568 569 alias EarmarkParser.Options 570 import EarmarkParser.Message, only: [sort_messages: 1] 571 572 @doc """ 573 iex(42)> markdown = "My `code` is **best**" 574 ...(42)> {:ok, ast, []} = EarmarkParser.as_ast(markdown) 575 ...(42)> ast 576 [{"p", [], ["My ", {"code", [{"class", "inline"}], ["code"], %{}}, " is ", {"strong", [], ["best"], %{}}], %{}}] 577 578 579 580 iex(43)> markdown = "```elixir\\nIO.puts 42\\n```" 581 ...(43)> {:ok, ast, []} = EarmarkParser.as_ast(markdown, code_class_prefix: "lang-") 582 ...(43)> ast 583 [{"pre", [], [{"code", [{"class", "elixir lang-elixir"}], ["IO.puts 42"], %{}}], %{}}] 584 585 **Rationale**: 586 587 The AST is exposed in the spirit of [Floki's](https://hex.pm/packages/floki). 
"""
  @spec as_ast(binary() | list(binary()), map() | keyword()) :: {:ok | :error, ast(), list()}
  def as_ast(lines, options \\ %Options{})

  # Canonical clause: `options` is already an `%Options{}` struct.
  # Parses and renders `lines`, then derives the overall status from the
  # messages collected along the way.
  def as_ast(lines, %Options{} = options) do
    context = _as_ast(lines, options)

    # Sorted messages taken from the render context, passed through
    # `Options.add_deprecations/2` together with the options in use.
    messages = Options.add_deprecations(options, sort_messages(context))

    # A single `:error` or `:warning` message turns the status into `:error`;
    # messages of any other severity leave it at `:ok`.
    failed? =
      Enum.any?(messages, fn {severity, _, _} -> severity in [:error, :warning] end)

    status = if failed?, do: :error, else: :ok

    {status, context.value, messages}
  end

  # Keyword-list options: build the `%Options{}` struct and delegate to the
  # canonical clause.
  def as_ast(lines, options) when is_list(options) do
    as_ast(lines, struct(Options, options))
  end

  # Map options (plain maps, or structs other than `%Options{}`, which is
  # matched by the clause above): drop `:__struct__` so that only the plain
  # fields are copied into a fresh `%Options{}`. `struct/2` accepts any
  # enumerable of key/value pairs, so the map is passed as is.
  def as_ast(lines, options) when is_map(options) do
    as_ast(lines, struct(Options, Map.delete(options, :__struct__)))
  end

  # Runs the two stages of the pipeline: the parser turns `lines` into blocks
  # plus a context (using the normalized options), and the AST renderer turns
  # those into the final context that `as_ast/2` reads its result from.
  defp _as_ast(lines, options) do
    normalized = Options.normalize(options)
    {blocks, context} = EarmarkParser.Parser.parse_markdown(lines, normalized)
    EarmarkParser.AstRenderer.render(blocks, context)
  end

  @doc """
    Accesses current hex version of the `EarmarkParser` application. Convenience for
    `iex` usage.

    Returns the version as a string. Raises a `MatchError` if the `:vsn` key of
    the `:earmark_parser` application cannot be fetched.
  """
  @spec version() :: String.t()
  def version() do
    # Assertive match: anything other than `{:ok, vsn}` is a setup error and
    # should crash loudly instead of being returned to the caller.
    {:ok, version} = :application.get_key(:earmark_parser, :vsn)
    to_string(version)
  end
end

# SPDX-License-Identifier: Apache-2.0