-
Notifications
You must be signed in to change notification settings - Fork 0
/
prototype.py
55 lines (47 loc) · 1.62 KB
/
prototype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import re
import urllib.request as urllib2
from collections import defaultdict
from bs4 import BeautifulSoup
# Root URL of the undergraduate guide being scraped; every relative href found
# below is appended to this prefix.
base = "https://cesd3.oit.umass.edu/undergradguide/2017-2018/"
# Page whose <li> menu entries are scanned for the per-major chapter links.
info = "Page12331.html"

# Read the page inside a ``with`` block so the HTTP response is closed as soon
# as its body has been read, instead of whenever it is garbage-collected.
with urllib2.urlopen(base + info) as response:
    courseinfo = response.read()
soup = BeautifulSoup(courseinfo, "lxml")

# (major title, relative URL) for every chapter entry in the navigation menu.
majors = []
for li in soup.find_all('li'):
    c = li.get('class')
    # A missing class attribute gives None (and an empty one may give an empty
    # list); an <li> without an <a> child has no link to follow. Skip both
    # rather than crash on them.
    if not c or li.a is None:
        continue
    if c[0] == "catalognavigationmenu-chapter":
        majors.append((li.a.contents[0], li.a.get('href')))
# Download each major's chapter page and parse it, keeping the major's title
# next to the parsed document: a list of (title, BeautifulSoup) pairs.
ml = []
for m, u in majors:
    # Close every response right after reading it so that connections are not
    # left open while the remaining pages are fetched.
    with urllib2.urlopen(base + u) as response:
        major = response.read()
    ml.append((m, BeautifulSoup(major, "lxml")))
# For every major, collect the link to its "The Courses" listing page as a
# (major title, relative URL) pair.
courses = []
for major_title, major_page in ml:
    for item in major_page.find_all('li'):
        css_classes = item.get('class')
        # Entries without a class attribute cannot be topic-group menu items.
        if css_classes is None:
            continue
        if css_classes[0] != "catalognavigationmenu-topicgroup":
            continue
        # Only the topic group whose link text is exactly "The Courses" counts.
        if item.a.contents[0] == "The Courses":
            courses.append((major_title, item.a.get('href')))
# Download and parse each major's course-listing page: a list of
# (major title, BeautifulSoup) pairs.
mc = []
for m, u in courses:
    # The page body gets its own name so that ``courses`` -- the list being
    # iterated -- is not rebound to raw bytes in the middle of the loop.
    with urllib2.urlopen(base + u) as response:
        course_page = response.read()
    mc.append((m, BeautifulSoup(course_page, "lxml")))
# Matches a course number in one of three shapes:
#   * three digits preceded by whitespace (or the start of the text) and
#     followed by whitespace,
#   * three digits plus a letter suffix, followed by whitespace or the end,
#   * three digits plus a letter suffix plus one more digit.
# The suffix class is [A-Za-z]: inside a character class "|" is a literal
# pipe, not alternation, so it must not be listed there.
# NOTE(review): the first alternative consumes the whitespace after the
# number, so a bare number that directly follows another one ("101 102 ")
# is not matched -- confirm whether a heading can list several numbers.
_COURSE_NUMBER_RE = re.compile(
    r'(?:^|\s)[0-9]{3}\s'
    r'|[0-9]{3}[A-Za-z]+(?:\s|$)'
    r'|[0-9]{3}[A-Za-z]+[0-9]'
)


def _course_numbers(text):
    """Return the course numbers found in *text*, stripped of whitespace."""
    return [match.group().strip() for match in _COURSE_NUMBER_RE.finditer(text)]


# Map each major's title to the course numbers found in the <strong> elements
# of its course-listing page.
classes = defaultdict(list)
for m, s in mc:
    for strong in s.find_all('strong'):
        numbers = _course_numbers(strong.text)
        # Touch classes[m] only when something was found, so a major with no
        # recognizable course numbers does not get an empty entry.
        if numbers:
            classes[m].extend(numbers)
# Write the major -> course-number mapping to disk. ``defaultdict`` is a
# ``dict`` subclass, so ``json.dump`` can serialize it directly; no
# dumps()/loads() round trip is needed beforehand.
with open('fetched.json', 'w', encoding='utf-8') as outfile:
    json.dump(classes, outfile)