-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapURI.py
121 lines (94 loc) · 3.37 KB
/
scrapURI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# import requests
# import pandas as pd
# from tqdm import tqdm
# def fetch_uri(href):
# endpoint = f"https://boris.unibe.ch/cgi/exportview/contributors_bern/{href}/JSON/{href}.js"
# response = requests.get(endpoint)
# if response.status_code == 200:
# data = response.json()
# uris = [item.get("uri") for item in data]
# return uris
# else:
# return []
# def main():
# input_csv = "resultDbMain.csv"
# output_csv = "uri2.csv"
# df = pd.read_csv(input_csv)
# href_list = df["href"].tolist()
# unique_uris = set()
# for href in tqdm(href_list, desc="Fetching URIs"):
# name = href.replace(".html", "")
# uris = fetch_uri(name)
# unique_uris.update(uris)
# uri_df = pd.DataFrame({"uri": list(unique_uris)})
# uri_df.to_csv(output_csv, index=False)
# print(f"Unique URIs saved to {output_csv}")
# if __name__ == "__main__":
# main()
# import requests
# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from tqdm import tqdm
# MAX_THREADS = 20
# def fetch_uri(href):
# endpoint = f"https://boris.unibe.ch/cgi/exportview/contributors_bern/{href}/JSON/{href}.js"
# response = requests.get(endpoint)
# if response.status_code == 200:
# data = response.json()
# uris = [item.get("uri") for item in data]
# return uris
# else:
# return []
# def main():
# input_csv = "resultDbMain.csv"
# output_csv = "uri.csv"
# df = pd.read_csv(input_csv)
# href_list = df["href"].tolist()
# unique_uris = set()
# with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
# futures = [executor.submit(fetch_uri, href.replace(".html", "")) for href in href_list]
# for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching URIs"):
# uris = future.result()
# unique_uris.update(uris)
# uri_df = pd.DataFrame({"uri": list(unique_uris)})
# uri_df.to_csv(output_csv, index=False)
# print(f"Unique URIs saved to {output_csv}")
# if __name__ == "__main__":
# main()
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
def fetch_uri(href, timeout=30):
    """Fetch publication URIs for one contributor view from the BORIS export API.

    Parameters
    ----------
    href : str
        Contributor page identifier (the ``.html``-stripped href taken from
        the input CSV by the callers in this file).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).
        New backward-compatible parameter; the original call could block
        indefinitely and stall a whole pool thread.

    Returns
    -------
    list
        The ``"uri"`` value of each item in the JSON payload, or an empty
        list on any network failure, non-200 status, or non-JSON body.
    """
    endpoint = f"https://boris.unibe.ch/cgi/exportview/contributors_bern/{href}/JSON/{href}.js"
    try:
        response = requests.get(endpoint, timeout=timeout)
    except requests.RequestException:
        # Connection errors / timeouts previously propagated and aborted the
        # entire run; treat them the same as a non-200 response instead.
        return []
    if response.status_code != 200:
        return []
    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON — NOTE(review): the API presumably returns
        # an HTML error page in some failure modes; best-effort empty result.
        return []
    return [item.get("uri") for item in data]
def process_institute(href, institute):
    """Resolve one institute page to its tagged publication URIs.

    Strips the ``.html`` suffix from *href*, fetches the URIs for that
    contributor view via ``fetch_uri``, and pairs each URI with *institute*.

    Returns a list of ``{"uri": ..., "institute": ...}`` dicts (possibly
    empty when the fetch yields nothing).
    """
    page_name = href.replace(".html", "")
    return [{"uri": found, "institute": institute} for found in fetch_uri(page_name)]
def main():
    """Read institute pages from ``resultDbMain.csv``, fetch every
    publication URI concurrently, and write (uri, institute) rows to
    ``uri.csv``."""
    source_path = "resultDbMain.csv"
    target_path = "uri.csv"

    frame = pd.read_csv(source_path)
    hrefs = frame["href"].tolist()
    institutes = frame["institute"].tolist()

    # Fan the per-institute fetches out over a small thread pool; the I/O
    # waits overlap, map() preserves input order, and tqdm only reports
    # progress as results complete.
    with ThreadPoolExecutor(max_workers=10) as pool:
        batches = list(tqdm(
            pool.map(process_institute, hrefs, institutes),
            total=len(hrefs),
            desc="Fetching URIs",
        ))

    # Flatten the per-institute result lists into one row list.
    rows = []
    for batch in batches:
        rows.extend(batch)

    pd.DataFrame(rows).to_csv(target_path, index=False)
    print(f"URIs with institutes saved to {target_path}")


if __name__ == "__main__":
    main()