-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools.py
46 lines (36 loc) · 1.98 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import itertools
import random
import concurrent.futures
import requests
from bs4 import BeautifulSoup
class LightningScraping:
    """Fan a single callable out over many inputs using a thread pool."""

    def scrape(self, functions, urlLists):
        """Call ``functions`` once per item of ``urlLists`` on worker threads.

        Args:
            functions: One callable taking a single argument. Despite the
                plural name it is handed to ``Executor.map`` as the function.
            urlLists: Iterable of arguments; each item is passed to one call.

        Returns:
            A list with one return value per input item, in input order.

        Raises:
            Exception: The first exception (in input order) raised by
                ``functions``. It is re-raised only after every submitted
                call has finished, so side effects of the other calls are
                still complete.
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executions = executor.map(functions, urlLists)
        # Leaving the ``with`` block waits for every task. Collecting the
        # results afterwards keeps the "run everything" behaviour, while
        # returning the values and surfacing worker errors instead of
        # silently discarding both.
        return list(executions)
class Flattened:
    """Helper that removes one level of nesting from an iterable of iterables."""

    def flat(self, lists):
        """Return a new list with the items of every inner iterable, in order.

        Args:
            lists: Iterable whose elements are themselves iterable.

        Returns:
            A flat list; only the outermost level of nesting is removed.
        """
        merged = []
        for inner in lists:
            merged.extend(inner)
        return merged
def get_user_agent():
    """Return one browser User-Agent string picked at random from a fixed pool."""
    candidates = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    )
    # Uniform pick; each call draws independently of earlier calls.
    chosen = random.choice(candidates)
    return chosen
class HimpubCategoryLinkScraper:
    """Collect links found in a page's ``<h4 class="panel-title">`` headings."""

    def scrape(self, base_url, timeout=30):
        """Download ``base_url`` and return the second link of each panel title.

        Args:
            base_url: URL of the page to fetch.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.
                Without one the request could block forever on a stalled
                server.

        Returns:
            A list, in document order, of the ``href`` value of the second
            ``<a>`` inside every ``h4.panel-title`` element (``None`` where
            that anchor has no ``href``). Headings with fewer than two
            anchors are skipped.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        response = requests.get(
            base_url,
            headers={"User-Agent": get_user_agent()},
            timeout=timeout,
        )
        soup = BeautifulSoup(response.content, 'html.parser')

        all_category_links = []
        for heading in soup.find_all('h4', class_='panel-title'):
            anchors = heading.find_all('a')
            # Only the second anchor is wanted; a heading without one is
            # skipped rather than aborting the whole scrape with IndexError.
            if len(anchors) > 1:
                all_category_links.append(anchors[1].get('href'))
        return all_category_links