-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_workinghours.py
186 lines (145 loc) · 6.65 KB
/
pdf_workinghours.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 17 11:02:55 2020
@author: Moon
"""
import pandas as pd

# Load the two halves of the RideAustin rides data set.
rides_a = pd.read_csv("Rides_DataA.csv")
rides_b = pd.read_csv("Rides_DataB.csv")
# Combine the A and B files on their shared ride identifier.
df = rides_a.merge(rides_b, left_on="RIDE_ID", right_on='RIDE_ID')
# Keep only the ride, time, location, fare, and vehicle columns needed downstream.
# BUG FIX: .copy() makes df_2 an independent frame; the original assigned new
# columns onto a slice of `df`, triggering SettingWithCopyWarning and risking
# silently lost writes.
df_2 = df[['RIDE_ID', 'started_on', 'completed_on', 'start_location_long', 'start_location_lat',
           'distance_travelled', 'end_location_lat', 'end_location_long', 'active_driver_id', 'rider_id',
           'base_fare', 'total_fare', 'rate_per_mile', 'rate_per_minute',
           'time_fare', 'driving_time_to_rider', 'driver_id', 'car_id', 'make', 'model', 'year']].copy()
from datetime import datetime  # NOTE(review): unused in this chunk; kept in case later code relies on it
# Parse the timestamps and derive the calendar date plus fractional-hour clock
# times (e.g. 14:30 -> 14.5; seconds are ignored, as in the original).
df_2['started_on'] = pd.to_datetime(df_2['started_on'], utc=True)  # convert object to date format
df_2['start_date'] = df_2['started_on'].dt.date
df_2['start_hr'] = df_2['started_on'].dt.hour + df_2['started_on'].dt.minute / 60
df_2['completed_on'] = pd.to_datetime(df_2['completed_on'], utc=True)  # convert object to date format
df_2['complete_hr'] = df_2['completed_on'].dt.hour + df_2['completed_on'].dt.minute / 60
# Order rides per driver per day so consecutive rows are consecutive rides.
df_3 = df_2.sort_values(by=['driver_id', 'start_date'])
df_3 = df_3.reset_index()
df_3['start_date'] = df_3['start_date'].astype(str)
# Scan consecutive rides of the same driver on the same day and accumulate idle
# gaps of at least 2.2 hours, which are treated as breaks rather than normal
# between-ride downtime.  NOTE: as in the original, the accumulator `b` is
# never reset between drivers/days; the de-cumulation step below recovers
# per-row break time by differencing.
break_driver_ids = []
break_dates = []
break_cumulative = []
b = 0.0  # running cumulative break time
# BUG FIX: iterate over the actual frame length instead of the hard-coded
# row count 1494125, which breaks as soon as the input data changes size.
for i in range(len(df_3) - 1):
    same_driver = df_3['driver_id'][i] == df_3['driver_id'][i + 1]
    same_day = df_3['start_date'][i] == df_3['start_date'][i + 1]
    if same_driver and same_day:
        gap = df_3['start_hr'][i + 1] - df_3['complete_hr'][i]
        if gap >= 2.2:  # shorter gaps are ignored
            b = b + gap
        if b != 0:
            break_driver_ids.append(df_3['driver_id'][i + 1])
            break_dates.append(df_3['start_date'][i + 1])
            break_cumulative.append(b)

df_4 = pd.DataFrame(list(zip(break_driver_ids, break_dates, break_cumulative)),
                    columns=['driver_id', 'date', 'breaktime'])
# BUG FIX: drop_duplicates() returns a new frame; the original discarded the
# result, so duplicates were never actually removed.
df_4 = df_4.drop_duplicates().reset_index(drop=True)
# De-cumulate: per-row break time is the difference of consecutive cumulative
# values.  Replaces a chained-assignment loop over a hard-coded range(1, 1261142)
# with a vectorized diff (first row stays missing, exactly as before).
df_4.insert(loc=3, column="breaktime_noncum", value=df_4['breaktime'].diff())
df_5 = df_4.groupby(['driver_id', 'date'])['breaktime_noncum'].sum().to_frame()
df_5 = df_5.reset_index()
df_5.to_csv("df_5_breaktime.csv")
# Find the min of start and max of end time: the extreme points of each
# driver's daily schedule.
working_hr = df_2.groupby(['driver_id', 'start_date']).agg({
    'start_hr': ['min'],
    'complete_hr': ['max']},
)
working_hr = working_hr.reset_index()
# agg() produced a two-level column index; flatten it back to plain names.
working_hr = working_hr.droplevel(level=1, axis=1)
working_hr = working_hr.reset_index()
# Subtract the accumulated break time from the raw first-to-last span of the day.
working_hour_up = pd.merge(working_hr, df_5, how="outer",
                           left_on=['driver_id', 'start_date'],
                           right_on=['driver_id', 'date'])
# BUG FIX: the original called replace(to_replace=np.nan, ...) but numpy is
# never imported in this file, so the script died with a NameError here.
# fillna(0) expresses the same "missing break time means zero" rule.
working_hour_up.fillna(0, inplace=True)
working_hour_up['duration'] = (working_hour_up['complete_hr']
                               - working_hour_up['start_hr']
                               - working_hour_up['breaktime_noncum'])
# Count how many observed days are available per driver.
dri_count = working_hour_up.groupby(['driver_id'])['driver_id'].count()
dri_count = dri_count.to_frame()
dri_count.rename(columns={'driver_id': '#count_available'}, inplace=True)
dri_count = dri_count.reset_index()
# Density plots need more than one observation per driver, so require > 1 row.
dri_count_10 = dri_count[dri_count['#count_available'] > 1]
working_hr_merged = working_hour_up.merge(dri_count_10, left_on='driver_id', right_on='driver_id')
# Keep only plausible (positive) daily durations.
working_hr_merged = working_hr_merged[working_hr_merged['duration'] > 0]
working_hr_merged.to_csv("working_hr_merged.csv")
# IF YOU HAVE THE RESULTANT CSV YOU CAN START FROM HERE.
import pandas as pd
working_hr_merged = pd.read_csv("working_hr_merged.csv")
# Only positive durations were kept when this CSV was written.
import matplotlib.pyplot as plt

# Overall density of daily working hours for the Austin data.
working_hr_merged.duration.plot(kind='density')
plt.xlabel('working hrs')
plt.ylabel('pdf')
plt.title("pdf for working hrs from rideaustin data")

# One density curve per driver, all drawn onto a shared axis.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
for driver in working_hr_merged['driver_id'].unique():
    per_driver = working_hr_merged[working_hr_merged['driver_id'] == driver]
    per_driver.duration.plot(ax=ax, kind='density')
plt.title('pdf of working hours of drivers')
plt.xlabel('Working hours')
#find best distribution and statistics/params
import scipy.stats as st
#following function adapted from Stack Overflow
def get_best_distribution(data):
    """Fit candidate distributions to *data* and pick the best by KS p-value.

    Fits each candidate with maximum likelihood, scores it with a
    Kolmogorov-Smirnov goodness-of-fit test, and returns the tuple
    (best_name, best_p_value, best_fit_params).
    """
    candidates = ["norm", "lognorm"]
    scored = []
    fitted_params = {}
    for name in candidates:
        distribution = getattr(st, name)
        fitted_params[name] = distribution.fit(data)
        # Kolmogorov-Smirnov test of the data against the fitted distribution.
        _, p_value = st.kstest(data, name, args=fitted_params[name])
        print("p value for " + name + " = " + str(p_value))
        scored.append((name, p_value))
    # The highest p-value wins: least evidence against that fit.
    best_dist, best_p = max(scored, key=lambda pair: pair[1])
    print("Best fitting distribution: " + str(best_dist))
    print("Best p value: " + str(best_p))
    print("Parameters for the best fit: " + str(fitted_params[best_dist]))
    return best_dist, best_p, fitted_params[best_dist]
# Report the best-fitting distribution for the drivers' daily durations.
best_fit = get_best_distribution(working_hr_merged['duration'])
print(best_fit)
# Plot the best distribution curve (here: normal) over a sampled histogram.
# BUG FIX: numpy was used below (np.random, np.sqrt, ...) but never imported
# anywhere in this file.
import numpy as np

bin_li = []
mu, sigma = 3.37, 2.93  # fitted mean / std of daily working hours (from the step above)
# BUG FIX: the original line `len(set(driver_id)=4803` was a syntax error that
# also referenced an undefined name; the intent was the count of unique
# drivers, recorded here as a named constant.
n_drivers = 4803
s = np.random.normal(mu, sigma, n_drivers)
# Create the bins and histogram: 20 equal-width intervals.  `bins` holds the
# 21 bin edges: bin k covers [bins[k], bins[k+1]) and the last bin also
# includes its right edge; `count` holds the frequency of each interval.
count, bins, ignored = plt.hist(s, 20)
bin_li.append(bins)
# Overlay the analytic normal pdf evaluated at the bin edges.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
ax.plot(bin_li[0],
        1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bin_li[0] - mu) ** 2 / (2 * sigma ** 2)),
        linewidth=3)
plt.show()