From ac2287e31ad281eb63d0fd64214852b1111357bd Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Thu, 23 Nov 2023 09:42:23 +0000 Subject: [PATCH] Identify where multiple <p>&nbsp;</p>
tags have been added and give warning. Also remove warning for old way of detecting mixed top/non-top content. Fixes #548. --- parser/lman_parser.py | 34 ++++++++++++++++++++++++++++++++++ parser/parser_utils.py | 6 +++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/parser/lman_parser.py b/parser/lman_parser.py index 5a6ecb02..8265f394 100644 --- a/parser/lman_parser.py +++ b/parser/lman_parser.py @@ -8,6 +8,7 @@ import os import subprocess from urllib.parse import urlparse +import bs4 from bs4 import BeautifulSoup from pprint import pprint, pformat from html_to_dita import htmlToDITA @@ -565,6 +566,39 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="") # insert rest of converted content dita_section.extend(converted_bits) + def is_empty_p_element(el): + if el is None: + return False + elif el.name == "p" and el.text.strip() == "" and len(el.find_all()) == 0: + return True + else: + return False + + def next_sibling_tag(el): + next_sib = el.next_sibling + while type(next_sib) is bs4.element.NavigableString: + next_sib = next_sib.next_sibling + + return next_sib + + # Check for repeated <p>&nbsp;</p>
elements + p_elements = page.find_all("p") + empty_p_elements = list(filter(is_empty_p_element, p_elements)) + + found = False + for el in empty_p_elements: + count = 0 + while is_empty_p_element(next_sibling_tag(el)): + count += 1 + if count >= 4: + found = True + break + if found: + logging.warning( + f"Found string of repeated <p>&nbsp;</p>
elements in div with ID {page.get('id')} in file {filename}" + ) + break + return dita_section def find_first_page_layer(self, top_to_div_mapping, html_soup): diff --git a/parser/parser_utils.py b/parser/parser_utils.py index 8f473226..cc445422 100644 --- a/parser/parser_utils.py +++ b/parser/parser_utils.py @@ -191,9 +191,9 @@ def generate_top_to_div_mapping( # exited in an earlier if statement), so we check if there are some elements without top values # and raise a warning if so if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1: - logging.warning( - f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}" - ) + # logging.warning( + # f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}" + # ) return [(0, html_soup)] return top_to_div_mapping