Improve regexps to parse HTML tags

AlexanderDokuchaev · Jun 25, 2024 · 252157c · 252157c
1 parent 18cca8a
commit 252157c
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 5 deletions.
diff --git a/md_dead_link_check/preprocess.py b/md_dead_link_check/preprocess.py
@@ -10,8 +10,8 @@
 
 RE_HEADER = r"^[#]{1,6}\s*(.*)"
 RE_LINK = r"([!]{0,1})\[([^\]!]*)\]\(([^\s)]+)(?:\s*(.*?))?\)"
-RE_HTML_A_TAG_ID = r"<a\s+id=[\"'](.*?)[\"']>.*?<\/a>"
-RE_HTML_A_TAG_HREF = r"<a\s+href=[\"'](.*?)[\"']>.*?<\/a>"
+RE_HTML_A_TAG_ID = r"<\w+\s+(?:[^>]*?\s+)?id=([\"'])(.*?)\1"
+RE_HTML_A_TAG_HREF = r"<\w+\s+(?:[^>]*?\s+)?href=([\"'])(.*?)\1"
 RE_SUB = r"[$`][^`]+?[$`]"
 
 
@@ -113,12 +113,12 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
 
             # Detect id under a tag <a id="introduction"></a>
             matches = re.findall(RE_HTML_A_TAG_ID, line)
-            for id in matches:
+            for _, id in matches:
                 fragments.append(id)
 
             # Detect links under a tag <a href="introduction"></a>
             matches = re.findall(RE_HTML_A_TAG_HREF, line)
-            for link in matches:
+            for _, link in matches:
                 links.append(LinkInfo(link, path, line_num))
     return MarkdownInfo(path=path, fragments=fragments, links=links)
 

diff --git a/tests/test_md_files/a.md b/tests/test_md_files/a.md
@@ -26,7 +26,7 @@ Some text
 
 [github](https://github.com/AlexanderDokuchaev)
 [b](./b.md) [d.a](b.md) `[A+B](A)`
-<a href="./d/a.md">d.a</a><a href="./d/a.md">d.a</a>
+<a href="./d/a.md" target="_blank">d.a</a><span href="./d/a.md">d.a</span>
 [d.a](/tests/test_md_files/d/a.md "tag")
 
 ### Header with `quotes` and $math$