-
Notifications
You must be signed in to change notification settings - Fork 0
/
Parser.py
72 lines (58 loc) · 2.43 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from charset_normalizer import from_bytes
from bs4 import BeautifulSoup
from bs4.element import Comment
import utils
class HTMLParser:
    """Static helpers for decoding HTML, collecting links and visible text."""

    # Parent tag names whose text nodes are treated as not visible.
    _HIDDEN_PARENT_TAGS = frozenset(
        {"style", "script", "head", "title", "meta", "[document]"}
    )

    @staticmethod
    def parseHTMLBytes(html: bytes) -> BeautifulSoup:
        """Decode raw HTML bytes and parse them with the ``html.parser`` backend.

        The encoding is guessed with ``charset_normalizer``. When no plausible
        encoding is found, the payload is decoded as UTF-8 with replacement
        characters instead of being turned into the literal string ``"None"``.
        """
        bestMatch = from_bytes(html).best()
        if bestMatch is None:
            # best() returns None when detection fails; str(None) would make
            # the parsed document contain just the word "None".
            decodedHtml = html.decode("utf-8", errors="replace")
        else:
            decodedHtml = str(bestMatch)
        return BeautifulSoup(decodedHtml, features="html.parser")

    @staticmethod
    def getAllLinksFromParsedHTML(parsedHTML: BeautifulSoup) -> set:
        """Return the set of raw ``href`` values found on ``<a>`` tags.

        Anchors without an ``href`` or with a blank one are skipped. Only the
        first whitespace-delimited token of each ``href`` is kept.
        """
        urlsFound = set()
        for anchorTag in parsedHTML.find_all("a"):
            href = anchorTag.get("href")
            if href is None:
                continue
            tokens = href.split()
            if tokens:
                urlsFound.add(tokens[0])
        return urlsFound

    @staticmethod
    def formatUrlsWithHostIfNeeded(urls, host: str) -> set:
        """Turn raw links into absolute ``https`` URLs and normalize them.

        Rules applied to each non-blank entry of ``urls`` (fragment removed):

        * ``//authority/path`` (protocol-relative) gets an ``https:`` prefix;
        * ``/path`` is prefixed with ``host``;
        * ``http://...`` is upgraded to ``https://...``;
        * ``https://...`` is kept unchanged;
        * anything else (fragment-only links, other schemes, relative paths
          without a leading slash) is dropped.

        ``host`` is concatenated directly in front of root-relative paths, so
        it presumably already carries the scheme and has no trailing slash --
        verify against callers. Each accepted URL is passed through
        ``utils.normalizeLinkIfCan`` before being added to the result.
        """
        formatedUrls = set()
        for url in urls:
            if url is None:
                continue
            # Drop surrounding whitespace and the fragment; a fragment-only
            # link ("#top") becomes empty and is skipped.
            url = url.strip().split("#", 1)[0]
            if not url:
                continue

            if url.startswith("//"):
                # Protocol-relative link: it names its own authority, so it
                # must not be glued onto ``host``.
                formatedUrl = "https:" + url
            elif url.startswith("/"):
                formatedUrl = f"{host}{url}"
            elif url.startswith("http://"):
                formatedUrl = "https://" + url[len("http://"):]
            elif url.startswith("https://"):
                formatedUrl = url
            else:
                continue

            formatedUrls.add(utils.normalizeLinkIfCan(formatedUrl))
        return formatedUrls

    @staticmethod
    def getNFirstTextWords(parsedHTML: BeautifulSoup, numWords: int) -> str:
        """Return the first ``numWords`` visible words joined by single spaces.

        Fewer words are returned when the document is shorter; a non-positive
        ``numWords`` yields an empty string.
        """
        words = HTMLParser.getVisibleTextFromParsedHtml(parsedHTML).split()
        # Slicing already copes with short documents; clamp so a negative
        # count does not slice from the end of the list.
        return " ".join(words[:max(numWords, 0)])

    @staticmethod
    def getVisibleTextFromParsedHtml(parsedHTML: BeautifulSoup) -> str:
        """Return the document's visible text nodes, stripped and space-joined.

        Based on https://stackoverflow.com/a/1983219/16264901
        """
        textNodes = parsedHTML.find_all(string=True)
        return " ".join(
            node.strip() for node in textNodes if HTMLParser.tag_visible(node)
        )

    @staticmethod
    def tag_visible(element):
        """Return True when a text node should count as visible text.

        A node is rejected when its parent tag is one of the hidden tags
        (style, script, head, title, meta, or the document root) or when the
        node is an HTML comment.
        """
        if element.parent.name in HTMLParser._HIDDEN_PARENT_TAGS:
            return False
        return not isinstance(element, Comment)