-
Notifications
You must be signed in to change notification settings - Fork 7
/
beauti.py
291 lines (253 loc) · 12.9 KB
/
beauti.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
__author="Anurag"
from config import Naukri_config
from bs4 import BeautifulSoup
from selenium import webdriver
import logging,datetime,time,random
config = Naukri_config()
from config import logfile,logformat
import lxml
class job_link_scraping():
    """Scrape naukri.com search-result pages for job links.

    Drives a Selenium browser through the paginated search results for a
    keyword (optionally restricted to a location), parses every page with
    BeautifulSoup using the selectors declared in ``config``, and inserts
    one document per job posting into the supplied database collection.
    """

    def __init__(self):
        """Set up a dedicated logger that also echoes to the console."""
        self.logger = logging.getLogger("Naukri job links scraping")
        self.logger.setLevel(logging.DEBUG)
        logging.basicConfig(filename=logfile)
        # BUGFIX: attach the stream handler only once -- re-instantiating
        # the class used to add a duplicate handler and double every line.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setLevel(logging.DEBUG)
            handler.setFormatter(logging.Formatter(logformat))
            self.logger.addHandler(handler)

    def linksAutomation(self, database, keyword, location=None, browser="chrome"):
        """Search naukri.com for *keyword* and store every job link found.

        database: collection-like object exposing ``insert(dict)``.
        keyword:  search phrase, e.g. "java developer".
        location: optional city/region filter; falsy means country-wide.
        browser:  "firefox", "chrome" (default); anything else -> PhantomJS.
        """
        if browser == "firefox":
            self.driver = webdriver.Firefox()
        elif browser == "chrome":
            self.driver = webdriver.Chrome(config['chromepath'])
        else:
            self.driver = webdriver.PhantomJS(config["phantomjspath"])
        try:
            # naukri search URLs look like "<keyword>-jobs[-in-<location>]"
            # with every space replaced by a dash.
            if location:
                url = "https://www.naukri.com/" + str(keyword.lower() + "-jobs-in-" + location).replace(" ", "-").lower()
            else:
                url = "https://www.naukri.com/" + str(keyword.lower() + "-jobs").replace(" ", "-").lower()
            self.logger.info("search url - %s", url)
            self.driver.get(url=url)
            time.sleep(5)  # let the first results page render
            pageCount = self.pageCount(pagesource=self.driver.page_source)
            self.logger.info("pages to search - %s", pageCount)
            # BUGFIX: range(1, pageCount) silently skipped the last page.
            for i in range(1, pageCount + 1):
                # page 1 has no numeric suffix; later pages append "-<n>".
                self.searchUrl = url if i == 1 else url + "-" + str(i)
                self.logger.info("search url - %s", self.searchUrl)
                self.driver.get(self.searchUrl)
                time.sleep(random.randint(5, 10))  # randomized anti-bot delay
                linksData = self.linksExtraction(pagesource=self.driver.page_source)
                if linksData:
                    for data in linksData:
                        data["searchKey"] = keyword
                        data["searchLocation"] = location
                        try:
                            self.logger.info("job links - %s", data)
                            self.logger.info("inserting job links into database - %s", database.insert(data))
                        except Exception as e:
                            # duplicate-_id inserts are expected on re-runs;
                            # log and keep going with the next document.
                            self.logger.fatal("exception in inserting job Links - %s", e)
        except Exception as e:
            self.logger.fatal("exception in job Links Searching - %s", e)
        finally:
            # BUGFIX: always release the browser, even when scraping raised
            # (quit() closes every window and ends the WebDriver session).
            self.driver.quit()

    def pageCount(self, pagesource):
        """Return the number of result pages (naukri shows 50 jobs/page).

        Falls back to 1 when the job-count element cannot be parsed so the
        caller still scrapes the first page -- the original implicitly
        returned None here, which made the caller's range() blow up.
        """
        Page = BeautifulSoup(pagesource, "lxml")
        try:
            result = str(Page.find(config["job_count"]["name"], config["job_count"]["attrs"]).text).split(" of ")[-1]
            self.logger.info("total jobs found - %s", result)
            # BUGFIX: round() undercounted a partially filled last page
            # (120 jobs -> round(2.4) == 2, losing page 3); use ceiling
            # division. Thousands separators ("1,234") are stripped first.
            total = int(result.replace(",", "").strip())
            return max(1, -(-total // 50))
        except Exception as e:
            self.logger.fatal("exception in finding total Job Links - %s", e)
            return 1

    def _field(self, section, key, label, default=""):
        """Return the stripped text of the element selected by config[key]
        inside *section*; log under *label* and return *default* when the
        element is missing or malformed."""
        try:
            return section.find(config[key]["name"], config[key]["attrs"]).text.strip()
        except Exception as e:
            self.logger.exception("exception in %s - %s", label, e)
            return default

    def linksExtraction(self, pagesource):
        """Parse one search-results page into a list of job dicts.

        Each dict uses the schema the description scraper expects; missing
        fields default to "" (posted date defaults to " ", matching the
        historical schema). The job URL doubles as ``_id`` so database
        inserts de-duplicate across runs.
        """
        descriptionArray = []
        Page = BeautifulSoup(pagesource, "lxml")
        try:
            for section in Page.find_all(config["job_section"]["name"], config["job_section"]["attrs"]):
                # id and url live directly on the section element's attributes
                descriptionId = section.get("id", "").strip()
                descriptionurl = section.attrs.get("data-url", "")
                descriptionArray.append({
                    "_id": descriptionurl,
                    "jobDescriptionID": descriptionId,
                    "jobDescriptionURL": descriptionurl,
                    "jobTitle": self._field(section, "job_title", "Job Title"),
                    "employer": self._field(section, "job_employer", "Job hiringOrganization"),
                    "jobLocation": self._field(section, "job_location", "job Location"),
                    # BUGFIX: skills were scraped but never persisted before.
                    "jobSkills": self._field(section, "job_skills", "job skills"),
                    "jobSummary": self._field(section, "job_summary", "job summary"),
                    "jobSalary": self._field(section, "job_salary", "job salary"),
                    "jobPosted": self._field(section, "job_posted", "job Posted Date", default=" "),
                    "jobExperience": self._field(section, "job_experience", "Job Exp Required"),
                    "scrapTime": datetime.datetime.now(),
                    "postType": "general",
                    "processFlag": "false",  # flipped by the description scraper
                    "source": "Naukri",
                    "jobType": "",
                    "moreJobsURL": "",
                    "postedBy": self._field(section, "job_recruiter", "job Recruiter"),
                })
        except Exception as e:
            self.logger.fatal("exception in Job Links Extraction - %s", e)
        return descriptionArray
class job_description_scraping():
    """Fetch the full description text for job links stored earlier.

    Walks unprocessed link documents (processFlag == "false"), loads each
    job URL in a Selenium browser, extracts the description block via the
    config selectors, and writes it back while flipping processFlag so the
    document is not revisited.
    """

    def __init__(self):
        """Set up a dedicated logger that also echoes to the console."""
        self.logger = logging.getLogger("Naukri job description scraping")
        self.logger.setLevel(logging.DEBUG)
        logging.basicConfig(filename=logfile)
        # BUGFIX: attach the stream handler only once -- re-instantiating
        # the class used to add a duplicate handler and double every line.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setLevel(logging.DEBUG)
            handler.setFormatter(logging.Formatter(logformat))
            self.logger.addHandler(handler)

    def descriptionAutomation(self, database, browser="chrome"):
        """Process up to 1000 pending job links from *database*.

        database: pymongo-style object exposing ``jobDescriptions.find``
                  and ``update``.
        browser:  "firefox", "chrome" (default); anything else -> PhantomJS.
        """
        if browser == "firefox":
            self.driver = webdriver.Firefox()
        elif browser == "chrome":
            self.driver = webdriver.Chrome(config["chromepath"])
        else:
            self.driver = webdriver.PhantomJS(config["phantomjspath"])
        try:
            for joblink in database.jobDescriptions.find({"processFlag": "false"}, {"_id": 1},
                                                         no_cursor_timeout=True).limit(1000):
                self.jobDescUrl = joblink["_id"]  # the _id IS the job URL
                self.logger.info("job description url - %s", self.jobDescUrl)
                self.driver.get(self.jobDescUrl)
                time.sleep(random.randint(5, 20))  # randomized anti-bot delay
                self.currenturl = self.driver.current_url
                self.logger.info("current url - %s", self.currenturl)
                # expired/removed postings redirect away from naukri.com
                if "www.naukri.com" in self.currenturl:
                    self.pagesource = self.driver.page_source
                    ########## calling function to extract the data #############
                    FullDesc = self.descriptionExtraction(page=self.pagesource)
                    # NOTE(review): descriptionExtraction returns the literal
                    # "error" on failure, which is truthy -- such records are
                    # still marked processed with full_desc == "error".
                    if FullDesc:
                        try:
                            self.logger.info("updating job description - %s",
                                             database.update({"_id": joblink["_id"]},
                                                             {"$set": {"full_desc": FullDesc,
                                                                       "processFlag": "true"}}))
                        except Exception as e:
                            self.logger.fatal("exception in updating job description - %s", e)
                else:
                    self.logger.fatal("the page has been re-directed")
                    self.logger.info("updating job description - %s",
                                     database.update({"_id": joblink["_id"]},
                                                     {"$set": {"processFlag": "reDirected"}}))
        finally:
            # BUGFIX: the original never closed the browser, leaking one
            # WebDriver session per run.
            self.driver.quit()

    def descriptionExtraction(self, page):
        """Return the description text found via config["job_description"],
        or the literal string "error" when the element is absent."""
        Page = BeautifulSoup(page, "lxml")
        try:
            description = Page.find(config["job_description"]["name"], config["job_description"]["attrs"])
            return description.text
        except Exception as e:
            self.logger.fatal("exception in job description - %s", e)
            return "error"
if __name__ == '__main__':
    from pymongo import MongoClient
    # Ad-hoc driver: scrape search-result links first, then enrich each
    # stored link with its full job description.
    collection = MongoClient("localhost", 27017)["jd-scraper"]["naukri.com"]
    link_scraper = job_link_scraping()
    link_result = link_scraper.linksAutomation(collection, keyword="java developer",
                                               location="", browser="chrome")
    desc_scraper = job_description_scraping()
    desc_scraper.descriptionAutomation(collection, browser="chrome")
    print("classcall", link_result)