-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
118 lines (99 loc) · 3.49 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from dateutil import parser
from flask import Flask, request, Response
from feedgen.feed import FeedGenerator
from unidecode import unidecode
import requests
import json
import re
# Single module-level Flask application; routes are registered on it below.
app = Flask(__name__)
# Canonical capitalization for each supported preprint service.  The lookup
# table below lets a URL like url.com/feed/EngrXiv resolve even though the
# real SHARE source name is engrXiv.
# TODO: SHARE really just needs to make these terms case-insensitive.
services_list = [
    'engrXiv',
    'PsyArXiv',
    'SocArXiv',
    'BITSS',
    'AgriXiv',
    'LawArXiv'
]

# lowercase name -> canonical name
services = {}
for _name in services_list:
    services[_name.lower()] = _name
# Complement of the XML 1.0 "Char" production: any character matched here is
# illegal in XML and must be stripped.  NOTE: the supplementary planes need
# the 8-digit \U escape -- the previous spelling \u10000-\u10FFFF parsed as
# U+1000 followed by the literal characters '0', '-', U+10FF, 'F', 'F',
# so astral-plane characters were wrongly removed.
_XML_ILLEGAL_RE = re.compile(
    u'[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+')


def valid_xml(text):
    """Return *text* transliterated to ASCII with XML-illegal characters removed.

    ``unidecode`` first maps non-ASCII characters to ASCII approximations
    (adapted from https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python),
    then anything outside the XML 1.0 Char production (e.g. control bytes
    other than tab/LF/CR) is stripped so feedgen emits well-formed XML.
    """
    return _XML_ILLEGAL_RE.sub('', unidecode(text))
def osf_url(urls, service):
    """Return the first OSF identifier in *urls*, with the
    ``preprints/<service>/`` path segment removed; '' if none is found.
    """
    strip_segment = re.compile(
        'preprints/{0}/'.format(service.lower()), re.IGNORECASE)
    osf_links = (u for u in urls if 'osf' in u)
    for link in osf_links:
        return strip_segment.sub('', link)
    return ''
def build_feed(url, service):
    """Build and return a FeedGenerator of the newest preprints for *service*.

    Queries the SHARE search API for up to 50 preprints from *service*,
    newest first, and turns each hit into a feed entry.  *url* is the
    self-link of the feed (the request URL); *service* is the canonical
    service name (e.g. 'engrXiv').

    Raises ``requests.RequestException`` on network failure and ``KeyError``
    if the search response does not have the expected shape.
    """
    fg = FeedGenerator()
    fg.id('http://osf.io/preprints/{0}'.format(service.lower()))
    fg.title('{0} Preprints'.format(service))
    fg.author({'name': service})
    fg.link(href=url, rel='self')
    fg.link(href='https://osf.io/preprints/{0}'.format(service.lower()),
            rel='alternate')
    fg.subtitle('Preprints submitted to {0} at https://osf.io/preprints/{1}'
                .format(service, service.lower()))

    headers = {'Content-Type': 'application/json'}
    response = requests.post(
        'https://share.osf.io/api/v2/search/creativeworks/_search',
        headers=headers,
        data=json.dumps({
            "query": {
                "bool": {
                    "must": {
                        "query_string": {
                            "query": "*"
                        }
                    },
                    "filter": [
                        {
                            "term": {
                                "sources": service
                            },
                        },
                        {
                            "term": {
                                "types": "preprint"
                            }
                        }
                    ]
                }
            },
            "from": 0,
            "size": 50,
            "sort": {
                "date_created": "desc"
            }
        }),
        timeout=30,  # don't hang the request thread forever on a slow API
    )

    entries = response.json()['hits']['hits']
    for entry in entries:
        fe = fg.add_entry()
        fe.title(valid_xml(entry['_source']['title']))
        fe.description(valid_xml(entry['_source']['description']))
        urls = entry['_source']['identifiers']
        link_url = osf_url(urls, service)
        fe.link(href=link_url)
        fe.id(link_url)

    # IFTTT doesn't seem to respect guid, so set the build date to the newest
    # entry's creation date.  This works because results are sorted by
    # date_created descending.  Guard against an empty result set, which
    # previously raised IndexError on entries[0].
    if entries:
        lastBuildDate = entries[0]['_source']['date_created']
        fg.lastBuildDate(parser.parse(lastBuildDate))
    return fg
@app.route("/")
def index():
    """Landing page: a plain-text placeholder."""
    body = "Feeds"
    return body
@app.route("/<service>.rss")
def rss(service=None):
    """Serve the RSS feed for *service* (case-insensitive).

    Returns the feed as ``application/rss+xml``, or a 404 response for an
    unknown service.  (Previously an unknown service fell off the end of
    the function and returned None, which Flask turned into a 500.)
    """
    canonical = services.get(service.lower())
    if canonical is None:
        return Response('Unknown service: {0}'.format(service),
                        status=404, mimetype='text/plain')
    fg = build_feed(request.url, canonical)
    response = Response(fg.rss_str(pretty=True))
    response.headers['Content-Type'] = 'application/rss+xml'
    return response
# Run Flask's development server when executed directly (not under a WSGI host).
if __name__ == "__main__":
    app.run()