-
Notifications
You must be signed in to change notification settings - Fork 0
/
WebAccesser.py
100 lines (80 loc) · 3.22 KB
/
WebAccesser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import datetime
import logging
import urllib3
import certifi
import reppy
import utils
class WebAccesser:
    """Small HTTP client that remembers the outcome of its most recent request.

    Requests go through a private urllib3 ``PoolManager`` (see
    ``_getCustomPoolManager``). The response and the timestamp of the latest
    request are exposed through read-only properties and a few convenience
    predicates. ``getRobotsOf`` separately downloads a host's robots.txt
    through ``reppy``.
    """

    # Headers sent with every request and with the robots.txt download.
    # NOTE(review): the User-Agent value still contains the literal
    # placeholders "platform", "geckoversion", "geckotrail" and
    # "firefoxversion" - confirm whether a real UA string was intended.
    REQ_HEADERS = {'User-Agent': "Mozilla/5.0 (platform; rv:geckoversion) Gecko/geckotrail Firefox/firefoxversion"}

    # Value passed as ``timeout`` to ``reppy.Robots.fetch`` (presumably seconds).
    MAX_TIME_REQ_FOR_ROBOTS = 10.0

    def __init__(self):
        """Create the connection pool and reset the last-request state."""
        self._poolManager = self._getCustomPoolManager()
        self._lastResponse = None
        self._lastRequestTimestamp = 0.0
        # Only CRITICAL records from urllib3's own logger are let through.
        logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    @property
    def poolManager(self):
        """Deliberately inaccessible: reading always raises ``AttributeError``."""
        raise AttributeError("poolManager is not directly readable")

    @poolManager.setter
    def poolManager(self, newPool):
        raise AttributeError("poolManager is not directly writable")

    @property
    def lastRequestTimestamp(self) -> float:
        """POSIX timestamp taken when the latest request started (0.0 if none)."""
        return self._lastRequestTimestamp

    @lastRequestTimestamp.setter
    def lastRequestTimestamp(self, newLastRequestTimestamp):
        raise AttributeError("lastRequestTimestamp is not directly writable")

    @property
    def lastResponse(self) -> "urllib3.response.HTTPResponse | None":
        """Response of the latest request, or ``None`` if there is none.

        It is ``None`` before the first request and after a request that
        raised instead of producing a response.
        """
        return self._lastResponse

    @lastResponse.setter
    def lastResponse(self, newResponse):
        raise AttributeError("lastResponse is not directly writable")

    def _getCustomPoolManager(self):
        """Build the ``PoolManager`` used for every request.

        Certificates are verified against the certifi CA bundle, the timeout
        is 2.0 connect / 3.0 read, and urllib3's retry handling is switched
        off with ``retries=False``.
        """
        timeout = urllib3.util.Timeout(connect=2.0, read=3.0)
        return urllib3.PoolManager(
            retries=False,
            cert_reqs='CERT_REQUIRED',
            ca_certs=certifi.where(),
            timeout=timeout,
        )

    def getRobotsOf(self, url: str) -> "reppy.Robots | None":
        """Fetch and return the robots.txt rules for the host of *url*.

        Returns ``None`` when the robots.txt URL cannot be derived or the
        download/parse fails. Only ``Exception`` subclasses are absorbed, so
        ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
        """
        try:
            url = utils.normalizeLinkIfCan(url)
        except Exception:
            # Normalisation is best-effort: fall back to the URL as given.
            pass

        try:
            hostRobotsPath = reppy.Robots.robots_url(url)
            return reppy.Robots.fetch(
                hostRobotsPath,
                timeout=self.MAX_TIME_REQ_FOR_ROBOTS,
                headers=self.REQ_HEADERS,
            )
        except Exception:
            return None

    def GETRequest(self, link: str):
        """Issue a GET for *link*; the result is available via ``lastResponse``."""
        self._doRequest('GET', link)

    def HEADRequest(self, link: str):
        """Issue a HEAD for *link*; the result is available via ``lastResponse``."""
        self._doRequest('HEAD', link)

    def _doRequest(self, reqType: str, link: str):
        """Send a *reqType* request to *link* and record timestamp and response.

        Any exception raised by the pool manager propagates to the caller.
        """
        self._lastRequestTimestamp = datetime.datetime.now().timestamp()
        # Forget the previous response first: if the request below raises, the
        # accessors must not keep reporting on a stale, unrelated response.
        self._lastResponse = None
        self._lastResponse = self._poolManager.request(
            reqType, link, headers=self.REQ_HEADERS
        )

    def lastResponseTextBytes(self) -> "bytes | None":
        """Return the body bytes of the last response, or ``None`` if there is none."""
        if self._lastResponse is None:
            return None
        return self._lastResponse.data

    def lastRequestSuccess(self) -> bool:
        """Return ``True`` when the last response exists and has a 2xx status."""
        response = self._lastResponse
        return response is not None and 200 <= response.status < 300

    def lastResponseHasTextHtmlContent(self) -> bool:
        """Return ``True`` when the last response's Content-Type mentions text/html.

        The comparison is case-insensitive because media type names are not
        case-sensitive.
        """
        if self._lastResponse is None:
            return False
        contentType = self._lastResponse.headers.get('content-type', "")
        return 'text/html' in contentType.lower()