-
Notifications
You must be signed in to change notification settings - Fork 0
/
02_getting_titles.py
50 lines (42 loc) · 1.29 KB
/
02_getting_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
With the lessons picked up from the previous script, we can now get the titles of the articles.
"""
import asyncio
import json

import requests
from bs4 import BeautifulSoup, Tag
from icecream import ic
def get_title(url: str, timeout: float = 10.0) -> "Tag | None":
    """
    Get the title of a webpage as specified by its first H1 tag.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first ``<h1>`` element of the page as a BeautifulSoup tag
        (read its ``.text`` for the title string), or ``None`` when the
        request fails, the server answers with an error status, or the
        page has no ``<h1>``.
    """
    try:
        # Without a timeout, requests can wait forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so the heading of an error page
        # is not mistaken for an article title.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    bs = BeautifulSoup(response.text, 'lxml')
    # Looking up a tag by attribute on the soup gives None when the tag is
    # absent; it does not raise, so no AttributeError handler is needed.
    return bs.h1
# The six sample pages to scrape: page1.html through page6.html.
urls = [
    f'http://www.pythonscraping.com/pages/page{number}.html'
    for number in range(1, 7)
]
async def main():
    """
    Fetch the title of every page in ``urls`` and save them to titles.json.

    The blocking ``get_title`` calls are handed to worker threads and
    awaited together, so the pages are fetched concurrently instead of one
    after another. The JSON file maps each URL to its title text, or to
    ``null`` when no title could be obtained.
    """
    # gather returns results in the order the awaitables were passed, so
    # they line up with urls.
    found = await asyncio.gather(
        *(asyncio.to_thread(get_title, url) for url in urls)
    )

    titles = {}
    for url, title in zip(urls, found):
        # always try to access the text attribute of the title tag just before storing it
        titles[url] = title.text if title is not None else None
        ic(url, title)
    ic(titles)

    # The with-block closes the file even if serialization fails; the
    # original left the handle open.
    with open('titles.json', 'w', encoding='utf-8') as out_file:
        json.dump(titles, out_file)
if __name__ == '__main__':
    # Start the scrape only when run as a script, not when imported.
    asyncio.run(main())