-
Notifications
You must be signed in to change notification settings - Fork 0
/
dblp_parsing.py
186 lines (158 loc) · 6.57 KB
/
dblp_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import os
import csv
# CSV file read by year_to_prof().
DATA_FILEPATH = "dblp_data/faculty_data - faculty.csv"

# Top-level DBLP element names that are treated as publication records.
PUBLICATION_ITEM_CATEGORIES = {
    "article",
    "inproceedings",
    "proceedings",
    "book",
    "incollection",
    "phdthesis",
    "mastersthesis",
    "www",
}

# DBLP XML dump read by parse_xml_for_publications().
DBLP_XML_FILEPATH = "dblp_data/dblp-2021-04-01.xml"

# CSV file that process_publication() appends one row per publication to.
CSV_OUTPUT_FILEPATH = "dblp_data/processed_publications.csv"
def main():
    """Script entry point: parse the DBLP XML dump into the publications CSV.

    The year_to_prof() summary is not part of the default run; call it
    explicitly if the per-year counts are wanted.
    :return: None
    """
    parse_xml_for_publications()
def year_to_prof(printing=0, filepath=None):
    """Counts {year: num of rows whose year is that year} in the faculty CSV.

    The first row is treated as a header and skipped. For every following
    row, the year is the LAST field that parses as an integer (presumably the
    year_of_job column -- confirm against the data file). Rows without any
    integer field are counted under the sentinel year -1.

    The file is read with the csv module, so quoted fields that contain
    commas (e.g. "Doe, Jane") are kept as a single field instead of being
    split apart.

    Uses: DATA_FILEPATH (only when filepath is None).
    :param printing: if truthy, print one "year: count" line per year.
    :param filepath: path of the CSV file to read; defaults to DATA_FILEPATH.
    :return: {year: num of rows with that year}
    """
    if filepath is None:
        filepath = DATA_FILEPATH

    year_to_num = {}
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that span lines.
    with open(filepath, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            year = -1
            for item in row:
                try:
                    year = int(item)
                except ValueError:
                    # Not an integer field: keep the last integer seen so far.
                    pass
            year_to_num[year] = year_to_num.get(year, 0) + 1

    if printing:
        for year in sorted(year_to_num):
            print("{}: {}".format(year, year_to_num[year]))
    total = sum(year_to_num.values())
    print("year_to_prof: total = {} professors".format(total))
    return year_to_num
def parse_xml_for_publications():
    """Parses the DBLP xml file and processes each publication (for memory efficiency).

    The file is scanned line by line. Whenever an opening tag whose name is in
    PUBLICATION_ITEM_CATEGORIES is found, every piece of text from that tag up
    to and including the matching closing tag is collected and handed to
    process_publication(); only one publication is held in memory at a time.
    Names of all other tags met outside a publication are collected and
    printed once the end of the file is reached.

    Uses: DBLP_XML_FILEPATH, PUBLICATION_ITEM_CATEGORIES, CSV_OUTPUT_FILEPATH.
    :return: None
    :raises ValueError: if the output file already exists, or if the file ends
        in the middle of a publication (closing tag never found).
    """
    if os.path.isfile(CSV_OUTPUT_FILEPATH):
        raise ValueError("File at CSV_OUTPUT_FILEPATH already exists: please delete the file to start parsing")

    with open(DBLP_XML_FILEPATH, 'r') as file:
        # Start with the first line of the file ("" if the file is empty):
        line = next(file, "")
        print("Start: {}".format(line))

        # Names of tags found outside publications that are not publication items:
        non_publication_items = []

        # Search for "<[publication_item]":
        while True:
            index_of_arrow = line.find("<")
            if index_of_arrow == -1:
                # Nothing left to inspect in this line: move on to the next one.
                line = next(file, None)
                if line is None:
                    print("Finished parsing the file")
                    print("non_publication_items = {}".format(non_publication_items))
                    return
                continue

            # Cases of location:
            # (i) "...<[publication_item] ...>"
            # (ii) "<[publication_item] ...>"
            # Either way, cut the line so that it starts at "<":
            line = line[index_of_arrow:]

            # The tag name ends at the first whitespace character or ">", so
            # both "<article key=...>" and a bare "<article>" are recognised.
            name_end = 1
            while (name_end < len(line) and line[name_end] != ">"
                   and not line[name_end].isspace()):
                name_end += 1
            publication_item = line[1:name_end]

            if publication_item not in PUBLICATION_ITEM_CATEGORIES:
                non_publication_items.append(publication_item)
                # Skip past "<" + name and keep scanning the rest of the line.
                line = line[name_end:]
                continue

            # Once found and legal, collect all strings from "<[publication_item] ..."
            # to "</publication_item>":
            publication = []
            closing_string = "</{}>".format(publication_item)
            while closing_string not in line:
                publication.append(line)
                line = next(file, None)
                if line is None:
                    raise ValueError(
                        "Unexpected end of file: no {} found for the last <{}> element".format(
                            closing_string, publication_item))
            end_index = line.find(closing_string) + len(closing_string)
            publication.append(line[:end_index])

            # Process the publication:
            process_publication(publication)

            # Update line to what's right after "</publication_item>", even if it's "":
            line = line[end_index:]
def process_publication(publication):
    """
    Helper function that processes one publication and appends it to the output CSV.

    The publication type is read from the closing tag that ends the last
    string. Years, titles and authors are the text contents of the <year>,
    <title> and <author> elements; elements may carry attributes and may span
    several of the given strings. The appended row is
    [type, years, number_of_authors, authors, titles], where years, authors
    and titles are lists, or None when no such element is present. The header
    row is written first if the output file does not exist yet.

    All parsing and validation happens before the output file is touched, so
    a malformed publication leaves the file unchanged.

    Uses: CSV_OUTPUT_FILEPATH, PUBLICATION_ITEM_CATEGORIES.
    :param publication: a list of strings, from the opening tag of the
        publication up to and including its closing tag.
    :return: None
    :raises ValueError: if the publication is empty, its type is not in
        PUBLICATION_ITEM_CATEGORIES, a year is not an integer, an element is
        never closed, or the extracted authors do not match the number of
        "<author" occurrences.
    """
    if not publication:
        raise ValueError("Processed publication type is not in PUBLICATION_ITEM_CATEGORIES")

    # Induce type from the closing tag at the end of the last string. Searching
    # for the last "</" keeps this correct when other text precedes the tag.
    closing_fragment = publication[-1].rstrip()
    type_begin = closing_fragment.rfind("</")
    publication_type = closing_fragment[type_begin + 2:-1] if type_begin != -1 else ""
    if publication_type not in PUBLICATION_ITEM_CATEGORIES:
        raise ValueError("Processed publication type is not in PUBLICATION_ITEM_CATEGORIES")

    # Work on the joined text so that an element split over several strings
    # is still extracted as a whole.
    text = "".join(publication)

    # Induce year, title and author:
    years = [int(value) for value in _element_texts(text, "year")]
    titles = _element_texts(text, "title")
    authors = _element_texts(text, "author")

    # Induce the number of authors (independent count, used as a sanity check):
    num_of_auth = text.count("<author")
    if len(authors) != num_of_auth:
        raise ValueError("len(author) = {}, while num_of_auth = {}".format(len(authors), num_of_auth))

    row = [publication_type, years or None, num_of_auth, authors or None, titles or None]

    header_needed = not os.path.isfile(CSV_OUTPUT_FILEPATH)
    # newline='' is required by the csv module; without it, platforms that
    # translate newlines end up with an extra "\r" in every row terminator.
    with open(CSV_OUTPUT_FILEPATH, 'a', newline='') as output_file:
        if header_needed:
            # Default (minimal) quoting keeps the header line unquoted.
            csv.writer(output_file).writerow(
                ["type", "year", "number_of_authors", "author", "title"])
        row_writer = csv.writer(output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        row_writer.writerow(row)


def _element_texts(text, tag):
    """Returns the text content of every <tag ...>...</tag> element in text, in order.

    :param text: the string to search.
    :param tag: the element name, without angle brackets.
    :return: a list of strings, one per element found.
    :raises ValueError: if an opening tag has no matching closing tag.
    """
    opening = "<" + tag
    closing = "</{}>".format(tag)
    contents = []
    position = text.find(opening)
    while position != -1:
        name_end = position + len(opening)
        following = text[name_end:name_end + 1]
        if following != ">" and not following.isspace():
            # A longer tag name that merely starts with `tag`: not a match.
            position = text.find(opening, name_end)
            continue
        # Skip any attributes up to the end of the opening tag.
        content_begin = text.find(">", name_end)
        content_end = -1 if content_begin == -1 else text.find(closing, content_begin + 1)
        if content_end == -1:
            raise ValueError("Unterminated <{}> element in publication".format(tag))
        contents.append(text[content_begin + 1:content_end])
        position = text.find(opening, content_end + len(closing))
    return contents
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()