-
Notifications
You must be signed in to change notification settings - Fork 135
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add an implementation for walking and optionally restructuring an Ear…
…mark AST. (#455) Co-authored-by: Jói Sigurdsson <joi@crankwheel.com>
- Loading branch information
1 parent
947b9ac
commit 97290aa
Showing
2 changed files
with
177 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
defmodule Earmark.Restructure do
  @moduledoc """
  Helpers for walking, and optionally restructuring, an Earmark AST.
  """

  @doc """
  Walks an AST and allows you to process it (storing details in `acc`) and/or
  modify it as it is walked.

    * `items` is the AST you got from `EarmarkParser.as_ast()`.

    * `acc` is the initial value of an accumulator that is passed to both
      `process_item_fn` and `process_list_fn` and accumulated. If your
      functions do not need to use or store any state, you can pass `nil`.

    * `process_item_fn` is required. It takes two parameters, the single item
      to process (a string or a 4-tuple) and the accumulator, and returns a
      tuple `{processed_item, updated_acc}`. Returning the empty list for
      `processed_item` removes the item from the AST; returning a non-empty
      list splices its elements in place of the item (the elements of such a
      list are not walked any further). When a 4-tuple element is returned,
      its children are walked afterwards. HTML comment nodes
      (`{:comment, attribs, lines, annotations}`) are kept as returned and
      their contents are not walked.

    * `process_list_fn` is optional and defaults to no modification of items
      or accumulator. It takes two parameters, the list of items that are the
      sub-items of a given element in the AST (or the top-level list of
      items), and the accumulator, and returns a tuple
      `{processed_items_list, updated_acc}`. It is applied to a list before
      the individual items of that list are processed.

  Returns `{ast, acc}`.
  """
  def walk_and_modify_ast(items, acc, process_item_fn, process_list_fn \\ &{&1, &2})
      when is_list(items) and is_function(process_item_fn) and is_function(process_list_fn) do
    {items, acc} = process_list_fn.(items, acc)

    {ast, acc} =
      Enum.map_reduce(items, acc, fn item, acc ->
        walk_and_modify_ast_item(item, acc, process_item_fn, process_list_fn)
      end)

    # process_item_fn may return a list (possibly empty) instead of a single
    # item; flattening splices those lists into the surrounding item list.
    {List.flatten(ast), acc}
  end

  # Applies process_item_fn to a single item and, when the result is an
  # element node, recurses into its children.
  defp walk_and_modify_ast_item(item, acc, process_item_fn, process_list_fn) do
    case process_item_fn.(item, acc) do
      {{type, attribs, items, annotations}, acc}
      when is_binary(type) and is_list(attribs) and is_list(items) and is_map(annotations) ->
        # walk_and_modify_ast/4 already returns a flattened list, so no
        # additional flattening is needed here.
        {items, acc} = walk_and_modify_ast(items, acc, process_item_fn, process_list_fn)
        {{type, attribs, items, annotations}, acc}

      {{:comment, _attribs, _lines, _annotations} = comment, acc} ->
        # HTML comments are tagged with the atom :comment rather than a
        # string; keep them as-is instead of failing with CaseClauseError.
        {comment, acc}

      {item_or_items, acc} when is_binary(item_or_items) or is_list(item_or_items) ->
        {item_or_items, acc}
    end
  end

  @doc """
  Utility for creating a restructuring that parses text by splitting it into
  parts "of interest" vs. "other parts" using a regular expression.

  `item` is the plain text, `regex` selects the parts of interest, and
  `map_captures_fn` receives the capture list of each match (as produced by
  `Regex.scan/2`) and returns the replacement for that match.

  Returns a single list in which the processed matches and the remaining text
  parts alternate, preserving the order they had in the plain text item. The
  list always starts and ends with a text part, so a match at the very
  beginning or end of `item` is preceded or followed by an empty string.
  """
  def text_to_ast_list_splitting_regex(item, regex, map_captures_fn)
      when is_binary(item) and is_function(map_captures_fn) do
    interest_parts =
      regex
      |> Regex.scan(item)
      |> Enum.map(map_captures_fn)

    # Regex.split keeps the (possibly empty) text before the first match and
    # after the last one, so other_parts is expected to hold exactly one more
    # element than interest_parts - the shape zigzag_lists/2 requires.
    other_parts = Regex.split(regex, item)

    zigzag_lists(other_parts, interest_parts)
  end

  @doc """
  Given two lists that are either of equal length, or with the first list
  exactly one element longer than the second, returns a list that begins with
  the first element from the first list, then the first element from the
  second list, then the second element from the first list, and so forth
  until both lists are empty.

  `acc` is the internal accumulator (built in reverse order) and should
  normally be omitted by callers.

  Raises `FunctionClauseError` when the lengths of the lists do not satisfy
  the requirement above.

  ## Examples

      iex> Earmark.Restructure.zigzag_lists(["a", "b", "c"], [1, 2])
      ["a", 1, "b", 2, "c"]

  """
  def zigzag_lists(first, second, acc \\ [])

  def zigzag_lists([], [], acc), do: Enum.reverse(acc)

  def zigzag_lists([first | first_rest], second, acc) do
    # Note that there is deliberately no clause for an empty 'first' list with a
    # non-empty 'second' list: the lists should either be equal in length, or
    # the first list as initially passed in should be one element longer.
    # Swapping the two lists on every step is what alternates the elements.
    zigzag_lists(second, first_rest, [first | acc])
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
defmodule RestructureTest do
  use ExUnit.Case

  alias Earmark.Restructure

  @doc """
  Example of a structure-changing transformation: in a non-standard markdown
  dialect `/` marks italics. Text items are scanned for that marker and each
  text containing it is replaced by a list of parts in which the marked
  fragments have become "em" nodes.
  """
  def handle_italics(ast) do
    {restructured, _last_element_name} =
      Restructure.walk_and_modify_ast(ast, "", &handle_italics_impl/2)

    restructured
  end

  # The accumulator holds the name of the most recently seen element. When it
  # is "a" the item is left untouched and the accumulator is reset.
  def handle_italics_impl(item, "a"), do: {item, ""}

  def handle_italics_impl(item, acc) when is_binary(item) do
    parts =
      Restructure.text_to_ast_list_splitting_regex(
        item,
        ~r/\/([[:graph:]].*?[[:graph:]]|[[:graph:]])\//,
        &em_node/1
      )

    {parts, acc}
  end

  # Remember the element name so that italics handling can be skipped for the
  # item that follows an <a> element.
  def handle_italics_impl({name, _, _, _} = item, _acc), do: {item, name}

  # Turns the captures of one regex match into an "em" AST node.
  defp em_node([_, content]), do: {"em", [], [content], %{}}

  @doc """
  Example of a mostly structure-preserving transformation: only the element
  type changes, for a non-standard markdown dialect in which a single `*`
  means "strong" text.
  """
  def handle_bold(ast) do
    {restructured, _acc} = Restructure.walk_and_modify_ast(ast, nil, &handle_bold_impl/2)
    restructured
  end

  def handle_bold_impl({"em", attribs, items, annotations}, acc),
    do: {{"strong", attribs, items, annotations}, acc}

  def handle_bold_impl(item, acc), do: {item, acc}

  @doc """
  Example of a list-level transformation: given the items of one AST node,
  drops every italic ("em") item and leaves the accumulator untouched.
  """
  def delete_italicized_text(items, acc) do
    without_em = Enum.reject(items, &match?({"em", _, _, _}, &1))
    {without_em, acc}
  end

  test "handle_bold_and_italic_from_nonstandard_markdown" do
    {:ok, ast, []} = EarmarkParser.as_ast("Hello *boldness* my /italic/ friend!")

    expected = [
      {"p", [],
       [
         "Hello ",
         {"strong", [], ["boldness"], %{}},
         " my ",
         {"em", [], ["italic"], %{}},
         " friend!"
       ], %{}}
    ]

    processed_ast =
      ast
      |> handle_bold()
      |> handle_italics()

    assert processed_ast == expected
  end

  test "delete_italicized_text" do
    {:ok, ast, []} = EarmarkParser.as_ast("Hello *there* my *good* friend!")

    identity = &{&1, &2}

    {processed_ast, :acc_unused} =
      Restructure.walk_and_modify_ast(ast, :acc_unused, identity, &delete_italicized_text/2)

    assert processed_ast == [{"p", [], ["Hello ", " my ", " friend!"], %{}}]
  end
end
# SPDX-License-Identifier: Apache-2.0 |