Add an implementation for walking and optionally restructuring an Earmark AST. (#455)

Co-authored-by: Jói Sigurdsson <joi@crankwheel.com>
pragdave · Nov 7, 2022 · 97290aa · 97290aa
1 parent 947b9ac
commit 97290aa
Show file tree

Hide file tree

Showing 2 changed files with 177 additions and 0 deletions.
diff --git a/lib/earmark/restructure.ex b/lib/earmark/restructure.ex
@@ -0,0 +1,83 @@
+defmodule Earmark.Restructure do
+  @moduledoc """
+  Helpers for walking an Earmark AST and, optionally, restructuring it while
+  it is being walked.
+
+  As far as this module is concerned an AST is a list of nodes, and a node is
+  either a binary (plain text) or a 4-tuple
+  `{type, attribs, items, annotations}` where `type` is a binary, `attribs`
+  and `items` are lists and `annotations` is a map.
+  """
+
+  @type ast_tuple :: {String.t(), list(), list(), map()}
+  @type ast_node :: String.t() | ast_tuple()
+  @type ast :: [ast_node()]
+
+  @doc """
+  Walks an AST and allows you to process it (storing details in `acc`) and/or
+  modify it as it is walked.
+
+  `items` is the AST you got from `EarmarkParser.as_ast/1`.
+
+  `acc` is the initial value of an accumulator that is passed to both
+  `process_item_fn` and `process_list_fn` and accumulated. If your functions
+  do not need to use or store any state, you can pass `nil`.
+
+  `process_item_fn` is required. It takes two parameters, the single item to
+  process (which will either be a string or a 4-tuple) and the accumulator,
+  and returns a tuple `{processed_item, updated_acc}`. `processed_item` may be:
+
+    * a 4-tuple - its sub-items are then walked recursively;
+    * a string - it is kept as is;
+    * a list of items - the list is spliced into the surrounding list of
+      items and is *not* walked again. Returning the empty list therefore
+      removes the processed item from the AST.
+
+  Any other return value raises a `CaseClauseError`.
+
+  `process_list_fn` is optional and defaults to no modification of items or
+  accumulator. It takes two parameters, the list of items that are the
+  sub-items of a given element in the AST (or the top-level list of items),
+  and the accumulator, and returns a tuple
+  `{processed_items_list, updated_acc}`. It is called on a list before the
+  individual items of that list are processed.
+
+  Returns `{ast, acc}`.
+
+  ## Examples
+
+      iex> ast = [{"p", [], ["Hello ", {"em", [], ["world"], %{}}], %{}}]
+      iex> em_to_strong = fn
+      ...>   {"em", attribs, items, annotations}, count ->
+      ...>     {{"strong", attribs, items, annotations}, count + 1}
+      ...>   item, count ->
+      ...>     {item, count}
+      ...> end
+      iex> Earmark.Restructure.walk_and_modify_ast(ast, 0, em_to_strong)
+      {[{"p", [], ["Hello ", {"strong", [], ["world"], %{}}], %{}}], 1}
+  """
+  @spec walk_and_modify_ast(
+          ast(),
+          acc,
+          (ast_node(), acc -> {ast_node() | ast(), acc}),
+          (ast(), acc -> {ast(), acc})
+        ) :: {ast(), acc}
+        when acc: term()
+  def walk_and_modify_ast(items, acc, process_item_fn, process_list_fn \\ &({&1, &2}))
+      when is_list(items) and is_function(process_item_fn, 2) and
+             is_function(process_list_fn, 2) do
+    # The list-level callback runs first so it can add, drop or reorder items
+    # before each remaining item is visited.
+    {items, acc} = process_list_fn.(items, acc)
+
+    {ast, acc} =
+      Enum.map_reduce(items, acc, fn item, acc ->
+        walk_and_modify_ast_item(item, acc, process_item_fn, process_list_fn)
+      end)
+
+    # An item callback may have returned a list (possibly empty) in place of a
+    # single item; flattening splices those lists into this level.
+    {List.flatten(ast), acc}
+  end
+
+  # Processes one item and, when the result is still a 4-tuple, recurses into
+  # its sub-items. Strings and lists returned by the callback are passed
+  # through untouched.
+  defp walk_and_modify_ast_item(item, acc, process_item_fn, process_list_fn) do
+    case process_item_fn.(item, acc) do
+      {{type, attribs, items, annotations}, acc}
+      when is_binary(type) and is_list(attribs) and is_list(items) and is_map(annotations) ->
+        # walk_and_modify_ast/4 already returns a flattened list of sub-items.
+        {items, acc} = walk_and_modify_ast(items, acc, process_item_fn, process_list_fn)
+        {{type, attribs, items, annotations}, acc}
+
+      {item_or_items, acc} when is_binary(item_or_items) or is_list(item_or_items) ->
+        {item_or_items, acc}
+    end
+  end
+
+  @doc """
+  Utility for creating a restructuring that parses text by splitting it into
+  parts "of interest" vs. "other parts" using a regular expression.
+
+  Every match of `regex` in `item` is passed to `map_captures_fn` as the list
+  of captures produced by `Regex.scan/2` (whole match first, then the capture
+  groups). The text between the matches is kept as plain strings.
+
+  Returns a single list in which the plain-text parts and the results of
+  `map_captures_fn` alternate, in the order they occurred in `item`. Because
+  `Regex.split/2` is called without `trim: true`, a match at the very start or
+  end of `item` leaves an empty string at the corresponding end of the result.
+
+  ## Examples
+
+      iex> Earmark.Restructure.text_to_ast_list_splitting_regex(
+      ...>   "a /b/ c",
+      ...>   ~r/\\/(.+?)\\//,
+      ...>   fn [_, content] -> {"em", [], [content], %{}} end
+      ...> )
+      ["a ", {"em", [], ["b"], %{}}, " c"]
+  """
+  @spec text_to_ast_list_splitting_regex(String.t(), Regex.t(), ([String.t()] -> term())) ::
+          list()
+  def text_to_ast_list_splitting_regex(item, regex, map_captures_fn)
+      when is_binary(item) and is_function(map_captures_fn, 1) do
+    interest_parts =
+      regex
+      |> Regex.scan(item)
+      |> Enum.map(map_captures_fn)
+
+    other_parts = Regex.split(regex, item)
+
+    # Regex.split keeps the (possibly empty) text before the first match and
+    # after the last one, so other_parts normally has exactly one element more
+    # than interest_parts. zigzag_lists/2 relies on the first list being the
+    # same length as the second, or one longer.
+    zigzag_lists(other_parts, interest_parts)
+  end
+
+  @doc """
+  Interleaves two lists, starting with the first one.
+
+  Given two lists that are either of equal length, or with the first list
+  exactly one element longer than the second, returns a list that begins with
+  the first element from the first list, then the first element from the
+  second list, then the second element from the first list, and so forth
+  until both lists are empty.
+
+  Any other combination of lengths raises a `FunctionClauseError`.
+
+  `acc` holds the elements interleaved so far in reverse order; callers
+  normally leave it at its default of `[]`.
+
+  ## Examples
+
+      iex> Earmark.Restructure.zigzag_lists([1, 3, 5], [2, 4])
+      [1, 2, 3, 4, 5]
+  """
+  @spec zigzag_lists(list(), list(), list()) :: list()
+  def zigzag_lists(first, second, acc \\ [])
+
+  def zigzag_lists([], [], acc) do
+    Enum.reverse(acc)
+  end
+
+  def zigzag_lists([first | first_rest], second, acc) do
+    # Swapping the two lists on every step is what alternates the source.
+    # Note that there will be no match for an empty 'first' list if 'second' is not empty,
+    # and this for our use case is on purpose - the lists should either be equal in
+    # length, or the first list as initially passed into the function should be one longer.
+    zigzag_lists(second, first_rest, [first | acc])
+  end
+end
diff --git a/test/restructure_test.exs b/test/restructure_test.exs
@@ -0,0 +1,94 @@
+defmodule RestructureTest do
+  use ExUnit.Case
+
+  alias Earmark.Restructure
+
+  @doc """
+  Example of a structure-changing walk: in a non-standard markdown dialect
+  where `/` marks italics, every text item is scanned for `/.../` spans and
+  each span is replaced by a new "em" node, splitting the surrounding text.
+  """
+  def handle_italics(ast) do
+    {italicized_ast, _last_seen} =
+      Restructure.walk_and_modify_ast(ast, "", &handle_italics_impl/2)
+
+    italicized_ast
+  end
+
+  # The accumulator is the name of the most recently seen element. The clause
+  # order matters: the "a" clause must come first so that the item visited
+  # right after an <a> element is left untouched.
+  def handle_italics_impl(item, "a") do
+    {item, ""}
+  end
+
+  def handle_italics_impl(text, last_seen) when is_binary(text) do
+    to_em_node = fn [_whole_match, content] -> {"em", [], [content], %{}} end
+    italics_regex = ~r/\/([[:graph:]].*?[[:graph:]]|[[:graph:]])\//
+
+    {Restructure.text_to_ast_list_splitting_regex(text, italics_regex, to_em_node), last_seen}
+  end
+
+  def handle_italics_impl({name, _, _, _} = element, _last_seen) do
+    # Remember the element name so that italics handling can be skipped
+    # inside <a> elements.
+    {element, name}
+  end
+
+  @doc """
+  Example of a mostly-structure-preserving walk: only the element type is
+  changed, for a non-standard markdown dialect where a single `*` means
+  "strong" text.
+  """
+  def handle_bold(ast) do
+    {bold_ast, _acc} = Restructure.walk_and_modify_ast(ast, nil, &handle_bold_impl/2)
+    bold_ast
+  end
+
+  def handle_bold_impl(item, acc) do
+    case item do
+      {"em", attribs, children, annotations} ->
+        {{"strong", attribs, children, annotations}, acc}
+
+      other ->
+        {other, acc}
+    end
+  end
+
+  @doc """
+  Example of a list-level callback: given the items of one AST node it drops
+  every italic ("em") item and leaves the accumulator unchanged.
+  """
+  def delete_italicized_text(items, acc) do
+    remaining = Enum.reject(items, &match?({"em", _, _, _}, &1))
+    {remaining, acc}
+  end
+
+  test "handle_bold_and_italic_from_nonstandard_markdown" do
+    {:ok, ast, []} = EarmarkParser.as_ast("Hello *boldness* my /italic/ friend!")
+
+    with_bold = handle_bold(ast)
+    processed_ast = handle_italics(with_bold)
+
+    expected_children = [
+      "Hello ",
+      {"strong", [], ["boldness"], %{}},
+      " my ",
+      {"em", [], ["italic"], %{}},
+      " friend!"
+    ]
+
+    assert processed_ast == [{"p", [], expected_children, %{}}]
+  end
+
+  test "delete_italicized_text" do
+    {:ok, ast, []} = EarmarkParser.as_ast("Hello *there* my *good* friend!")
+
+    keep_item = fn item, acc -> {item, acc} end
+
+    {processed_ast, :acc_unused} =
+      Restructure.walk_and_modify_ast(ast, :acc_unused, keep_item, &delete_italicized_text/2)
+
+    assert processed_ast == [{"p", [], ["Hello ", " my ", " friend!"], %{}}]
+  end
+end
+#  SPDX-License-Identifier: Apache-2.0