Skip to content

Commit

Permalink
Improve regexp to parse html tags
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexanderDokuchaev committed Jun 25, 2024
1 parent 18cca8a commit 252157c
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
8 changes: 4 additions & 4 deletions md_dead_link_check/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@

RE_HEADER = r"^[#]{1,6}\s*(.*)"
RE_LINK = r"([!]{0,1})\[([^\]!]*)\]\(([^\s)]+)(?:\s*(.*?))?\)"
RE_HTML_A_TAG_ID = r"<a\s+id=[\"'](.*?)[\"']>.*?<\/a>"
RE_HTML_A_TAG_HREF = r"<a\s+href=[\"'](.*?)[\"']>.*?<\/a>"
RE_HTML_A_TAG_ID = r"<\w+\s+(?:[^>]*?\s+)?id=([\"'])(.*?)\1"
RE_HTML_A_TAG_HREF = r"<\w+\s+(?:[^>]*?\s+)?href=([\"'])(.*?)\1"
RE_SUB = r"[$`][^`]+?[$`]"


Expand Down Expand Up @@ -113,12 +113,12 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:

# Detect id attributes on any HTML tag, e.g. <a id="introduction"></a>
matches = re.findall(RE_HTML_A_TAG_ID, line)
for id in matches:
for _, id in matches:
fragments.append(id)

# Detect href links on any HTML tag, e.g. <a href="introduction"></a>
matches = re.findall(RE_HTML_A_TAG_HREF, line)
for link in matches:
for _, link in matches:
links.append(LinkInfo(link, path, line_num))
return MarkdownInfo(path=path, fragments=fragments, links=links)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_md_files/a.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Some text

[github](https://github.com/AlexanderDokuchaev)
[b](./b.md) [d.a](b.md) `[A+B](A)`
<a href="./d/a.md">d.a</a><a href="./d/a.md">d.a</a>
<a href="./d/a.md" target="_blank">d.a</a><span href="./d/a.md">d.a</span>
[d.a](/tests/test_md_files/d/a.md "tag")

### Header with `quotes` and $math$
Expand Down

0 comments on commit 252157c

Please sign in to comment.