-
Notifications
You must be signed in to change notification settings - Fork 0
/
09_crawling_the_web.py
38 lines (24 loc) · 1.06 KB
/
09_crawling_the_web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import requests
from bs4 import BeautifulSoup
# Download the page at `url` and collect every tag whose text mentions
# "brian" (case-insensitive).
url = "https://brianobot.github.io"
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; cap the wait explicitly (seconds).
html = requests.get(url, timeout=10).text
bs = BeautifulSoup(html, "lxml")
# NOTE: `tag.text` includes the text of all descendants, so ancestors of a
# matching tag (e.g. <html>, <body>) are returned as well.
tags_with_brian = bs.find_all(lambda tag: "brian" in tag.text.lower())
# Download a Wikipedia article and list every hyperlink target and image
# source found anywhere on the page.
wikipedia_url = "http://en.wikipedia.org/wiki/Kevin_Bacon"
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; cap the wait explicitly (seconds).
html = requests.get(wikipedia_url, timeout=10).text
bs = BeautifulSoup(html, "lxml")
links = bs.find_all('a')
images = bs.find_all('img')

# Anchors without an href (and images without a src) print as "None".
for link in links:
    print(link.attrs.get('href'))
for image in images:
    print(image.attrs.get('src'))

print(f"\nNumber of Image found = {len(images)}")
print(f"\nNumber of Links found = {len(links)}")
# More effectively, we can rely on the fact that links on Wikipedia follow
# certain patterns. Article links share the following features:
# • They reside within the div with the id set to bodyContent.
# • The URLs do not contain colons.
# • The URLs begin with /wiki/.
#
# BeautifulSoup applies a compiled pattern with `search()`, so the leading
# `^` is required to enforce "begins with /wiki/"; without it, hrefs such as
# "//en.wikipedia.org/wiki/Foo" would also match.
article_href_pattern = re.compile(r'^(/wiki/)((?!:).)*$')
body_content = bs.find('div', {'id': "bodyContent"})
# `find` returns None when the div is missing (e.g. an error page or a layout
# change); fall back to an empty result instead of raising AttributeError.
if body_content is None:
    article_links = []
else:
    article_links = body_content.find_all('a', {'href': article_href_pattern})

for link in article_links:
    print(link.attrs.get('href'))