-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scrapping_remacs.py
264 lines (198 loc) · 11.5 KB
/
Scrapping_remacs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import praw # Python Reddit API Wrapper: authenticated access to Reddit's API
from psaw import PushshiftAPI # Pushshift.io wrapper: bulk search of public reddit submissions/comments (returns PRAW objects when given a reddit instance)
import datetime as dt # used to build the per-day epoch timestamps for the search windows
import pandas as pd # used to assemble the scraped columns into a DataFrame
from google.colab import drive # Colab-only: gives the notebook access to Google Drive so the CSV can be saved there
drive.mount('/gdrive/') # Mount Google Drive at /gdrive/ (prompts for authorization in Colab)
#Create an authenticated PRAW Reddit instance. The client_id/client_secret are
#obtained by registering an application on Reddit.
#SECURITY WARNING (review): live credentials are hard-coded and published here.
#They should be revoked and loaded from environment variables or an
#uncommitted config file instead of appearing in source.
reddit = praw.Reddit(client_id='C0NBhA1qhdkIAg',client_secret='FEHr1Wc9qQ4Ws_NSzQwfPiJQjp0',username='KushMetaheuristic',password='Mithai-1234',user_agent='DrAdams_Task3')
#Wrap the Reddit instance with Pushshift so search hits come back as full PRAW
#submission objects (fields like ups, downs, upvote_ratio, comments).
api = PushshiftAPI(reddit)
subreddit = 'emacs' # Target subreddit: r/emacs
#r/emacs
#Column accumulators: every scraped submission appends exactly one value to
#each list, so all lists stay the same length and can later be combined
#column-wise into a single DataFrame.
title=[]
url=[]
upvote_count=[]
downvote_count=[]
comments_count=[]
upvote_ratio=[]
overall_vote=[]
subreddit_name=[]
month=[]
day=[]
id=[] # NOTE(review): shadows the builtin id() for the rest of the script
total_awards_received=[]
#The scrape below iterates one day at a time because the APIs cap a single
#query at roughly 1000 results; the working assumption is that the subreddit
#never produces more than 1000 posts in one day.
#j is the month index, starting at January (j = 1) and ending at March (j = 3).
# Day-by-day scrape of r/emacs submissions for January-March 2020.
#
# Pushshift caps a single query at ~1000 results, so each query covers one
# calendar day (assumption: the subreddit never exceeds 1000 posts/day).
#
# BUGS FIXED (review):
#  * The original computed every day's start_epoch with a hard-coded month of
#    1 (dt.datetime(2020, 1, st_i)), so the February and March windows
#    actually began in January and each "day" query spanned weeks of
#    duplicated posts. start_epoch now uses the current month.
#  * The March loop was range(7, 32) (days 6-30) even though the comment
#    claimed it started on the 1st - presumably a resume artifact; the loop
#    now covers the whole month, and the awkward separate "last day" branch
#    is gone because the end of each window is computed by date rollover.

MONTH_NAMES = {1: 'Jan', 2: 'Feb', 3: 'Mar'}
# Number of days in each scraped month of 2020 (leap year: February has 29).
MONTH_LENGTHS = {1: 31, 2: 29, 3: 31}

def _record_submission(k, month_label, day_of_month):
    """Append one submission's fields onto the module-level column lists."""
    id.append(k.id)                                      # unique identifier
    title.append(k.title)                                # post title
    url.append(k.url)                                    # URL for reference
    upvote_count.append(k.ups)                           # upvote count
    downvote_count.append(k.downs)                       # NOTE(review): the Reddit API reportedly always returns 0 here - verify
    overall_vote.append(k.ups-k.downs)                   # net score (ups - downs)
    upvote_ratio.append(k.upvote_ratio)                  # fraction of votes that are upvotes
    comments_count.append(len(k.comments))               # number of comments
    subreddit_name.append(k.subreddit)                   # subreddit object (r/emacs)
    total_awards_received.append(k.total_awards_received)# awards received
    month.append(month_label)                            # e.g. 'Jan'
    day.append(day_of_month)                             # day of month (1-31)

def _collect_day(year, month_num, day_num):
    """Fetch every r/<subreddit> submission posted on one calendar day and
    record it via _record_submission. The window is [day, day+1) in epoch
    seconds; timedelta handles month/year rollover for the last day."""
    window_start = dt.datetime(year, month_num, day_num)
    start_epoch = int(window_start.timestamp())
    end_epoch = int((window_start + dt.timedelta(days=1)).timestamp())
    submissions = list(api.search_submissions(
        after=start_epoch, before=end_epoch, subreddit=subreddit,
        filter=['url','author', 'title', 'subreddit'], limit=None))
    for k in submissions:
        _record_submission(k, MONTH_NAMES[month_num], day_num)

# Iterate the three target months, one day per query.
for j in (1, 2, 3):
    print('Working on ' + MONTH_NAMES[j])
    for day_num in range(1, MONTH_LENGTHS[j] + 1):
        print(day_num)
        _collect_day(2020, j, day_num)
# Assemble the scraped columns into one DataFrame and persist it as CSV.
# IDIOM (review): the original built twelve single-column DataFrames and
# pd.concat-ed them on axis=1; constructing directly from a dict of
# equal-length columns is the idiomatic, equivalent pandas approach
# (column order follows dict insertion order).
emacs_df = pd.DataFrame({
    'identifier': id,
    'title': title,
    'url': url,
    'upvote count': upvote_count,
    'downvote count': downvote_count,
    'comments count': comments_count,
    'upvote ratio': upvote_ratio,
    'overall vote': overall_vote,
    'subreddit': subreddit_name,
    'total awards': total_awards_received,
    'month': month,
    'day': day,
})
emacs_df  # no-op outside a notebook cell; kept for interactive inspection
# Write to the mounted Google Drive (directory must already exist; the
# default integer index is written as the first CSV column).
emacs_df.to_csv('/gdrive/My Drive/DrAdams_Task3_Files/CSV/emacs_finalDF.csv')