# Reading data
# Monthly store-level sales file. stringsAsFactors is set explicitly because
# the read.csv() default changed in R 4.0.0; the summary recorded below
# tabulates `month` by level, i.e. it was produced with factor columns.
BOPS <- read.csv("Monthly Sales Data_Final.csv", stringsAsFactors = TRUE)
summary(BOPS)
## X store_number summary month_index
## Min. : 1.0 Min. : 2 Min. : 1.000 Min. :13.0
## 1st Qu.: 270.2 1st Qu.: 2 1st Qu.: 4.000 1st Qu.:19.0
## Median : 539.5 Median : 6 Median : 9.000 Median :25.0
## Mean : 539.5 Mean :1677 Mean : 9.356 Mean :25.4
## 3rd Qu.: 808.8 3rd Qu.:5998 3rd Qu.:13.000 3rd Qu.:31.0
## Max. :1078.0 Max. :5998 Max. :21.000 Max. :37.0
##
## monthly_sales BOPS month year
## Min. : 19 Min. :0.0000 AUG :122 Min. :2010
## 1st Qu.: 9746 1st Qu.:0.0000 DEC : 92 1st Qu.:2011
## Median : 29009 Median :0.0000 JAN : 90 Median :2011
## Mean : 166785 Mean :0.3738 JUL : 90 Mean :2011
## 3rd Qu.: 174640 3rd Qu.:1.0000 JUN : 90 3rd Qu.:2012
## Max. :4727543 Max. :1.0000 FEB : 89 Max. :2012
## (Other):505
## treatment bridal
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000
## Mean :0.7208 Mean :0.4007
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
##
# Transaction-level purchase/return file. stringsAsFactors is set explicitly
# because the read.csv() default changed in R 4.0.0; the summary recorded
# below tabulates text columns (dates, gender, month, ...) by level, i.e. it
# was produced with factor columns.
Return <- read.csv("Transaction data merged.csv", stringsAsFactors = TRUE)
summary(Return)
## X customer_id purchase_date
## Min. : 1 Min. :1.003e+05 13DEC2012:00:00:00: 12989
## 1st Qu.: 417876 1st Qu.:2.569e+07 12DEC2012:00:00:00: 12286
## Median : 835752 Median :3.015e+07 18DEC2012:00:00:00: 12234
## Mean : 835752 Mean :2.944e+10 06DEC2012:00:00:00: 11614
## 3rd Qu.:1253627 3rd Qu.:3.499e+07 11DEC2012:00:00:00: 10228
## Max. :1671502 Max. :9.197e+11 22DEC2011:00:00:00: 9896
## (Other) :1602255
## transaction_id store_number net_purchase_amount sku
## Min. : 7198 Min. : 2.0 Min. : 0.00 Min. :10033405
## 1st Qu.:2649367 1st Qu.: 2.0 1st Qu.: 45.13 1st Qu.:17859869
## Median :3355444 Median : 2.0 Median : 90.00 Median :18126698
## Mean :3338930 Mean : 229.5 Mean : 170.33 Mean :18300993
## 3rd Qu.:4030256 3rd Qu.: 2.0 3rd Qu.: 189.00 3rd Qu.:18584417
## Max. :4702552 Max. :5998.0 Max. :39422.00 Max. :80006100
##
## return return_date return_store
## Min. :0.000 :1502656 Min. : 2.0
## 1st Qu.:0.000 02JUN2011:00:00:00: 921 1st Qu.: 2.0
## Median :0.000 21JUL2011:00:00:00: 803 Median : 2.0
## Mean :0.101 25APR2012:00:00:00: 802 Mean : 756.7
## 3rd Qu.:0.000 10JAN2013:00:00:00: 781 3rd Qu.:1479.0
## Max. :1.000 08JAN2013:00:00:00: 744 Max. :5998.0
## (Other) : 164795 NA's :1502656
## time_to_return gender age_band est_income_code
## Min. : 0.0 :210507 Min. : 0.00 Min. :1.00
## 1st Qu.: 8.0 F:693941 1st Qu.: 0.00 1st Qu.:4.00
## Median : 17.0 M:719292 Median : 5.00 Median :6.00
## Mean : 27.8 U: 47762 Mean : 4.95 Mean :5.42
## 3rd Qu.: 33.0 3rd Qu.: 8.00 3rd Qu.:7.00
## Max. :1679.0 Max. :13.00 Max. :9.00
## NA's :1502656 NA's :72038 NA's :68664
## ethnic_code homeowner_code length_of_residence child
## N :615387 : 68664 Min. : 0.00 : 68664
## S :204830 O:1064498 1st Qu.: 2.00 N:977479
## H :187647 R: 538340 Median : 6.00 Y:625359
## Z :161313 Mean : 7.14
## G :120011 3rd Qu.:13.00
## U : 71027 Max. :15.00
## (Other):311287 NA's :68664
## year month month_index summary
## Min. :2010 DEC :425369 Min. :13.00 Min. : 1.00
## 1st Qu.:2011 NOV :187468 1st Qu.:22.00 1st Qu.: 5.00
## Median :2012 FEB :179716 Median :31.00 Median : 9.00
## Mean :2012 MAY :143596 Mean :31.47 Mean :10.07
## 3rd Qu.:2012 JAN :127777 3rd Qu.:41.00 3rd Qu.:12.00
## Max. :2013 APR :102339 Max. :48.00 Max. :21.00
## (Other):505237 NA's :4
## fy12 fy13
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.05689 Mean :0.1095
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
##
# Cleaning data
# Keep only transactions with non-missing age band, income code, length of
# residence and `summary` code.
return <- Return[
  complete.cases(
    Return[, c("age_band", "est_income_code", "length_of_residence", "summary")]
  ),
]
# Drop gender code "U" and purchases with a net amount of exactly zero.
return <- return[return$gender != "U" & return$net_purchase_amount != 0, ]

# Recode the text attributes as 0/1 indicators.
# NOTE(review): the raw file also has blank gender codes; any that survive the
# filters above are coded 0 together with "F" -- confirm this is intended.
return$gender <- ifelse(return$gender == "M", 1, 0)
return$homeowner_code <- ifelse(return$homeowner_code == "O", 1, 0)
return$child <- ifelse(return$child == "Y", 1, 0)

# BOPS indicator is the sum of the fy12 and fy13 flags.
return$BOPS <- return$fy12 + return$fy13

# Drop the row-index column carried over from the CSV.
return$X <- NULL

# Remaining NAs (return_store / time_to_return) are deliberately left as NA.
# Overwriting them with " " coerced those numeric columns to character, and
# none of the models below use them.
#return$age_band<-as.numeric(Return$age_band)
#return$est_income_code<-as.numeric(Return$est_income_code)
#return$length_of_residence<-as.numeric(Return$length_of_residence)
#return$summary<-as.numeric(Return$summary)
# Collapse the `summary` codes into four groups:
#   group 1: codes 1, 2, 20
#   group 2: codes 3, 4, 7, 11, 12, 15
#   group 3: codes 5, 21
#   group 4: every other code
return$Cgroup <- ifelse(
  return$summary %in% c(1, 2, 20), 1,
  ifelse(
    return$summary %in% c(3, 4, 7, 11, 12, 15), 2,
    ifelse(return$summary %in% c(5, 21), 3, 4)
  )
)
# %in% never yields NA, so restore NA for a missing code (as == would give).
return$Cgroup[is.na(return$summary)] <- NA
# Convert the three-letter month label into a month number.
# JAN..NOV map to 1..11; any other non-missing label falls through to 12.
return$Month <- as.numeric(
  match(
    return$month,
    c("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
      "JUL", "AUG", "SEP", "OCT", "NOV"),
    nomatch = 12L
  )
)
# A missing label stays missing rather than taking the fall-through value.
return$Month[is.na(return$month)] <- NA
#Summary
# Histogram of the raw net purchase amount.
hist(return$net_purchase_amount)
# Please refer to the Net Purchase Amount histogram.
# Histogram of the logged purchase amount; the log scale is what the models
# below use.
hist(log(return$net_purchase_amount))
return[["lognet_purchase_amount"]] <- log(return[["net_purchase_amount"]])
# Correlation between the home-owner indicator and the income code (~0.35).
with(return, cor(homeowner_code, est_income_code))
## [1] 0.355759
summary(return)
## customer_id purchase_date transaction_id
## Min. :1.003e+05 13DEC2012:00:00:00: 12022 Min. : 7206
## 1st Qu.:2.581e+07 12DEC2012:00:00:00: 11582 1st Qu.:2639234
## Median :3.015e+07 18DEC2012:00:00:00: 11138 Median :3316969
## Mean :3.060e+10 06DEC2012:00:00:00: 10670 Mean :3330541
## 3rd Qu.:3.490e+07 22DEC2011:00:00:00: 9382 3rd Qu.:4002518
## Max. :9.197e+11 11DEC2012:00:00:00: 9338 Max. :4702379
## (Other) :1441842
## store_number net_purchase_amount sku
## Min. : 2.000 Min. : 0.01 Min. :10033405
## 1st Qu.: 2.000 1st Qu.: 49.00 1st Qu.:17856907
## Median : 2.000 Median : 94.99 Median :18118190
## Mean : 4.437 Mean : 174.07 Mean :18282413
## 3rd Qu.: 2.000 3rd Qu.: 190.48 3rd Qu.:18559310
## Max. :5998.000 Max. :39422.00 Max. :40005720
##
## return return_date return_store
## Min. :0.0000 :1349914 Length:1505974
## 1st Qu.:0.0000 02JUN2011:00:00:00: 900 Class :character
## Median :0.0000 21JUL2011:00:00:00: 792 Mode :character
## Mean :0.1036 25APR2012:00:00:00: 761
## 3rd Qu.:0.0000 10JAN2013:00:00:00: 730
## Max. :1.0000 08JAN2013:00:00:00: 713
## (Other) : 152164
## time_to_return gender age_band est_income_code
## Length:1505974 Min. :0.0000 Min. : 0.000 Min. :1.000
## Class :character 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:4.000
## Mode :character Median :0.0000 Median : 5.000 Median :6.000
## Mean :0.4624 Mean : 5.071 Mean :5.425
## 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:7.000
## Max. :1.0000 Max. :13.000 Max. :9.000
##
## ethnic_code homeowner_code length_of_residence child
## N :583396 Min. :0.0000 Min. : 0.00 Min. :0.0000
## S :194533 1st Qu.:0.0000 1st Qu.: 2.00 1st Qu.:0.0000
## H :174091 Median :1.0000 Median : 6.00 Median :0.0000
## Z :148988 Mean :0.6672 Mean : 7.17 Mean :0.3905
## G :114840 3rd Qu.:1.0000 3rd Qu.:13.00 3rd Qu.:1.0000
## U : 66311 Max. :1.0000 Max. :15.00 Max. :1.0000
## (Other):223815
## year month month_index summary
## Min. :2010 DEC :399820 Min. :13.00 Min. : 1.00
## 1st Qu.:2011 NOV :172176 1st Qu.:22.00 1st Qu.: 5.00
## Median :2012 FEB :167100 Median :31.00 Median : 9.00
## Mean :2012 MAY :129747 Mean :31.22 Mean :10.21
## 3rd Qu.:2012 JAN :108860 3rd Qu.:41.00 3rd Qu.:13.00
## Max. :2013 OCT : 85378 Max. :48.00 Max. :21.00
## (Other):442893
## fy12 fy13 BOPS Cgroup
## Min. :0.00000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:2.000
## Median :0.00000 Median :0.0000 Median :0.000 Median :2.000
## Mean :0.06084 Mean :0.1052 Mean :0.166 Mean :2.471
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:3.000
## Max. :1.00000 Max. :1.0000 Max. :1.000 Max. :4.000
##
## Month lognet_purchase_amount
## Min. : 1.00 Min. :-4.605
## 1st Qu.: 4.00 1st Qu.: 3.892
## Median : 8.00 Median : 4.554
## Mean : 7.51 Mean : 4.589
## 3rd Qu.:12.00 3rd Qu.: 5.250
## Max. :12.00 Max. :10.582
##
# Cross-tabulate the return flag (rows) against the BOPS indicator (columns).
xtabs(~ return + BOPS, data = return)
## BOPS
## return 0 1
## 0 1127629 222285
## 1 128361 27699
# Return - OLS: least-squares fit with the 0/1 return flag as the response.
model0 <- lm(
  return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  data = return
)
summary(model0)
##
## Call:
## lm(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, data = return)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36080 -0.12107 -0.09116 -0.05901 1.02914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.006e-02 2.857e-03 -10.522 < 2e-16 ***
## lognet_purchase_amount 3.494e-02 3.465e-04 100.830 < 2e-16 ***
## factor(gender)1 -2.767e-02 5.070e-04 -54.581 < 2e-16 ***
## age_band 1.373e-04 6.429e-05 2.136 0.032668 *
## est_income_code 2.274e-03 1.111e-04 20.477 < 2e-16 ***
## factor(BOPS)1 1.477e-02 6.662e-04 22.171 < 2e-16 ***
## factor(store_number)6 -3.415e-03 9.673e-04 -3.531 0.000415 ***
## factor(store_number)5998 -3.637e-02 1.294e-02 -2.811 0.004944 **
## factor(summary)2 4.970e-02 2.118e-03 23.462 < 2e-16 ***
## factor(summary)3 -7.693e-03 2.579e-03 -2.983 0.002855 **
## factor(summary)4 -1.840e-02 1.822e-03 -10.101 < 2e-16 ***
## factor(summary)5 -2.933e-02 1.871e-03 -15.674 < 2e-16 ***
## factor(summary)6 2.371e-02 2.349e-03 10.096 < 2e-16 ***
## factor(summary)7 -4.593e-02 2.251e-03 -20.408 < 2e-16 ***
## factor(summary)8 9.216e-02 9.139e-03 10.084 < 2e-16 ***
## factor(summary)9 -2.054e-02 2.481e-03 -8.278 < 2e-16 ***
## factor(summary)10 1.045e-01 2.379e-02 4.391 1.13e-05 ***
## factor(summary)11 -3.511e-02 2.277e-03 -15.414 < 2e-16 ***
## factor(summary)12 -2.930e-02 1.880e-03 -15.585 < 2e-16 ***
## factor(summary)13 -3.309e-02 2.380e-03 -13.903 < 2e-16 ***
## factor(summary)14 1.110e-02 2.838e-03 3.911 9.20e-05 ***
## factor(summary)15 7.494e-05 5.798e-02 0.001 0.998969
## factor(summary)17 -3.002e-02 5.178e-03 -5.799 6.69e-09 ***
## factor(summary)20 4.188e-02 2.433e-03 17.211 < 2e-16 ***
## factor(summary)21 -2.646e-02 1.905e-03 -13.892 < 2e-16 ***
## Month -1.138e-03 6.194e-05 -18.365 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3011 on 1505948 degrees of freedom
## Multiple R-squared: 0.02381, Adjusted R-squared: 0.02379
## F-statistic: 1469 on 25 and 1505948 DF, p-value: < 2.2e-16
# Fitted values of the OLS model on the estimation data, and their range.
predictedprobability_lm<-predict(model0)
range(predictedprobability_lm)
## [1] -0.2519388 0.3608037
# The lower bound is negative, which is not a valid probability; this is the
# reason for moving to the logit models below.
# Return - logit: outcome counts per predictor variable.
# a = rows flagged as returned, b = rows flagged as not returned.
a <- sum(return[["return"]] == 1)
b <- sum(return[["return"]] == 0)
# Each count is divided by 8, the number of variables in the model formula.
print(a / 8)
## [1] 19507.5
print(b / 8)
## [1] 168739.2
#Logit Model
library(aod)
# Logistic regression of the return flag on the logged purchase amount,
# demographics, BOPS, store and `summary` code dummies, and month number.
logit1 <- glm(
  return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  data = return,
  family = "binomial"
)
summary(logit1)
##
## Call:
## glm(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, family = "binomial", data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1635 -0.4967 -0.4285 -0.3622 2.8011
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.7613358 0.0293407 -128.195 < 2e-16 ***
## lognet_purchase_amount 0.3641270 0.0037416 97.318 < 2e-16 ***
## factor(gender)1 -0.2989171 0.0056485 -52.919 < 2e-16 ***
## age_band 0.0011642 0.0007106 1.638 0.101368
## est_income_code 0.0245533 0.0012303 19.957 < 2e-16 ***
## factor(BOPS)1 0.1596335 0.0071852 22.217 < 2e-16 ***
## factor(store_number)6 -0.0406505 0.0110070 -3.693 0.000221 ***
## factor(store_number)5998 -0.4128376 0.1567756 -2.633 0.008456 **
## factor(summary)2 0.5473665 0.0187653 29.169 < 2e-16 ***
## factor(summary)3 -0.0360564 0.0224140 -1.609 0.107691
## factor(summary)4 -0.0186899 0.0162576 -1.150 0.250304
## factor(summary)5 -0.1348549 0.0172518 -7.817 5.42e-15 ***
## factor(summary)6 0.2942750 0.0207772 14.163 < 2e-16 ***
## factor(summary)7 -0.7952926 0.0271023 -29.344 < 2e-16 ***
## factor(summary)8 0.6134323 0.0700742 8.754 < 2e-16 ***
## factor(summary)9 -0.1303657 0.0276037 -4.723 2.33e-06 ***
## factor(summary)10 0.7178547 0.1809793 3.967 7.29e-05 ***
## factor(summary)11 -0.1856125 0.0216703 -8.565 < 2e-16 ***
## factor(summary)12 -0.1416295 0.0173568 -8.160 3.35e-16 ***
## factor(summary)13 -0.1605009 0.0232874 -6.892 5.49e-12 ***
## factor(summary)14 0.2498520 0.0279416 8.942 < 2e-16 ***
## factor(summary)15 0.1362366 0.5446474 0.250 0.802481
## factor(summary)17 -0.3083705 0.0446093 -6.913 4.76e-12 ***
## factor(summary)20 0.3458585 0.0204505 16.912 < 2e-16 ***
## factor(summary)21 -0.1240058 0.0177070 -7.003 2.50e-12 ***
## Month -0.0124147 0.0006839 -18.152 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 967982 on 1505948 degrees of freedom
## AIC: 968034
##
## Number of Fisher Scoring iterations: 6
# Variant of the logit model on the raw (unlogged) purchase amount, with
# gender, store_number and summary entered as plain numeric terms.
logit2 <- glm(
  return ~ net_purchase_amount + gender + age_band + est_income_code +
    factor(BOPS) + store_number + summary + Month,
  data = return,
  family = "binomial"
)
summary(logit2)
##
## Call:
## glm(formula = return ~ net_purchase_amount + gender + age_band +
## est_income_code + factor(BOPS) + store_number + summary +
## Month, family = "binomial", data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4685 -0.4836 -0.4506 -0.4156 2.3747
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.978e+00 9.956e-03 -198.672 < 2e-16 ***
## net_purchase_amount 5.844e-04 6.657e-06 87.792 < 2e-16 ***
## gender -2.361e-01 5.502e-03 -42.913 < 2e-16 ***
## age_band -2.278e-03 7.004e-04 -3.252 0.00115 **
## est_income_code 1.918e-02 1.218e-03 15.752 < 2e-16 ***
## factor(BOPS)1 8.192e-02 7.076e-03 11.577 < 2e-16 ***
## store_number -5.302e-05 2.615e-05 -2.027 0.04262 *
## summary -1.473e-02 4.276e-04 -34.454 < 2e-16 ***
## Month -2.056e-02 6.711e-04 -30.640 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 990247 on 1505965 degrees of freedom
## AIC: 990265
##
## Number of Fisher Scoring iterations: 5
# Chi-square test of logit1 against the intercept-only model.
# Drop in deviance:
logit1$null.deviance - logit1$deviance
## [1] 34938.21
# Degrees of freedom taken up by the model terms:
logit1$df.null - logit1$df.residual
## [1] 25
# Upper-tail chi-square probability of that drop:
pchisq(
  logit1$null.deviance - logit1$deviance,
  logit1$df.null - logit1$df.residual,
  lower.tail = FALSE
)
## [1] 0
# The p-value prints as 0, so the model as a whole is significant.
# Coefficients exponentiated to the odds-ratio scale.
exp(coef(logit1))
## (Intercept) lognet_purchase_amount factor(gender)1
## 0.02325266 1.43925692 0.74162089
## age_band est_income_code factor(BOPS)1
## 1.00116485 1.02485721 1.17308081
## factor(store_number)6 factor(store_number)5998 factor(summary)2
## 0.96016464 0.66176977 1.72869453
## factor(summary)3 factor(summary)4 factor(summary)5
## 0.96458586 0.98148365 0.87384269
## factor(summary)6 factor(summary)7 factor(summary)8
## 1.34215297 0.45144912 1.84675921
## factor(summary)9 factor(summary)10 factor(summary)11
## 0.87777435 2.05003054 0.83059538
## factor(summary)12 factor(summary)13 factor(summary)14
## 0.86794279 0.85171707 1.28383540
## factor(summary)15 factor(summary)17 factor(summary)20
## 1.14595303 0.73464307 1.41320257
## factor(summary)21 Month
## 0.88337469 0.98766209
# Two-row covariate profile (BOPS = 0 and BOPS = 1) with every other predictor
# held at its sample mean, for use with predict(logit2, ...).
# net_purchase_amount is the mean of the RAW amount: logit2 at this point is
# fitted on net_purchase_amount, so supplying the mean of the logged amount
# (as was done previously) evaluated the model at the wrong purchase value.
# The printed output recorded below predates this correction; regenerate it.
newdata1 <- with(return, data.frame(
  net_purchase_amount = mean(net_purchase_amount),
  gender = mean(gender),
  age_band = mean(age_band),
  est_income_code = mean(est_income_code),
  store_number = mean(store_number),
  summary = mean(summary),
  Month = mean(Month),
  BOPS = factor(c(0, 1))
))
newdata1
## net_purchase_amount gender age_band est_income_code store_number
## 1 4.589477 0.462427 5.070692 5.42532 4.436687
## 2 4.589477 0.462427 5.070692 5.42532 4.436687
## summary Month BOPS
## 1 10.21452 7.510413 0
## 2 10.21452 7.510413 1
# Predicted return probability (response scale) for each row of newdata1.
# logit2 here is the model fitted on the raw net_purchase_amount, so the
# net_purchase_amount column of newdata1 must be on that same raw scale.
newdata1$BOPSP <- predict(logit2, newdata = newdata1, type = "response", se.fit=FALSE)
newdata1
## net_purchase_amount gender age_band est_income_code store_number
## 1 4.589477 0.462427 5.070692 5.42532 4.436687
## 2 4.589477 0.462427 5.070692 5.42532 4.436687
## summary Month BOPS BOPSP
## 1 10.21452 7.510413 0 0.09136769
## 2 10.21452 7.510413 1 0.09840007
# Prediction grid: 1000 purchase amounts from 0 to 40000, repeated once per
# BOPS level, with every other predictor held at its sample mean.
# BOPS uses rep(..., each = 1000) so the first copy of the amount sequence is
# paired with BOPS = 0 and the second with BOPS = 1. Plain factor(c(0, 1))
# recycled as 0,1,0,1,..., which gave each amount only one BOPS level and made
# rows 1001-2000 duplicates of rows 1-1000.
# The printed output recorded below predates this correction; regenerate it.
newdata2 <- with(return, data.frame(
  net_purchase_amount = rep(seq(from = 0, to = 40000, length.out = 1000), 2),
  gender = mean(gender),
  age_band = mean(age_band),
  est_income_code = mean(est_income_code),
  store_number = mean(store_number),
  summary = mean(summary),
  Month = mean(Month),
  BOPS = factor(rep(c(0, 1), each = 1000))
))
# Link-scale predictions with standard errors (se.fit spelled out in full
# rather than relying on partial argument matching of `se`).
newdata3 <- cbind(
  newdata2,
  predict(logit2, newdata = newdata2, type = "link", se.fit = TRUE)
)
# Back-transform the fit and a +/- 1.96 SE band to the probability scale.
newdata3 <- within(newdata3, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))
})
head(newdata3)
## net_purchase_amount gender age_band est_income_code store_number
## 1 0.00000 0.462427 5.070692 5.42532 4.436687
## 2 40.04004 0.462427 5.070692 5.42532 4.436687
## 3 80.08008 0.462427 5.070692 5.42532 4.436687
## 4 120.12012 0.462427 5.070692 5.42532 4.436687
## 5 160.16016 0.462427 5.070692 5.42532 4.436687
## 6 200.20020 0.462427 5.070692 5.42532 4.436687
## summary Month BOPS fit se.fit residual.scale UL
## 1 10.21452 7.510413 0 -2.299731 0.003301816 1 0.09168277
## 2 10.21452 7.510413 1 -2.194412 0.006542114 1 0.10141600
## 3 10.21452 7.510413 0 -2.252931 0.003111797 1 0.09562305
## 4 10.21452 7.510413 1 -2.147613 0.006470915 1 0.10574792
## 5 10.21452 7.510413 0 -2.206132 0.003005479 1 0.09972881
## 6 10.21452 7.510413 1 -2.100813 0.006443180 1 0.11025052
## LL PredictedProb
## 1 0.09061059 0.09114526
## 2 0.09910272 0.10025343
## 3 0.09457335 0.09509690
## 4 0.10337305 0.10455453
## 5 0.09867601 0.09920117
## 6 0.10779719 0.10901780
#ggplot(newdata3, aes(x = net_purchase_amount, y = PredictedProb)) +
# geom_ribbon(aes(ymin = LL, ymax = UL, fill = BOPS), alpha = .2) +
# geom_line(aes(colour = BOPS), size=1)
#Check Multicollinearity - No
library(VIF)
library(usdm)
## Loading required package: sp
## Loading required package: raster
##
## Attaching package: 'usdm'
## The following object is masked from 'package:VIF':
##
## vif
# Please refer to the Log of Net Purchase Amount histogram.
# Numeric predictors used for the correlation / VIF check. The factor terms
# (store_number and the month label) are left out; the numeric Month stays.
df <- data.frame(
  return$net_purchase_amount,
  return$gender,
  return$age_band,
  return$est_income_code,
  return$BOPS,
  return$Month
)
cor(df)
## return.net_purchase_amount return.gender
## return.net_purchase_amount 1.0000000000 0.09756170
## return.gender 0.0975616969 1.00000000
## return.age_band 0.0003710182 0.04069056
## return.est_income_code -0.0024527297 0.06457983
## return.BOPS -0.0112614092 -0.03173979
## return.Month -0.0338921618 0.04898091
## return.age_band return.est_income_code
## return.net_purchase_amount 0.0003710182 -0.00245273
## return.gender 0.0406905558 0.06457983
## return.age_band 1.0000000000 0.17056570
## return.est_income_code 0.1705656962 1.00000000
## return.BOPS -0.0367181108 -0.01847097
## return.Month 0.0592982644 0.05080511
## return.BOPS return.Month
## return.net_purchase_amount -0.01126141 -0.03389216
## return.gender -0.03173979 0.04898091
## return.age_band -0.03671811 0.05929826
## return.est_income_code -0.01847097 0.05080511
## return.BOPS 1.00000000 -0.01238196
## return.Month -0.01238196 1.00000000
# Variance inflation factors for the columns of df.
# Namespace-qualified because VIF, usdm and (later, via AER) car all export a
# function named vif; an unqualified call depends on package attach order.
usdm::vif(df)
## Variables VIF
## 1 return.net_purchase_amount 1.013345
## 2 return.gender 1.019280
## 3 return.age_band 1.034647
## 4 return.est_income_code 1.037860
## 5 return.BOPS 1.002388
## 6 return.Month 1.007942
# Interaction models
# BOPS x category-group interaction. This assignment replaces the logit2
# fitted earlier on the raw purchase amount.
logit2 <- glm(
  return ~ factor(BOPS) * factor(Cgroup) + factor(BOPS) +
    lognet_purchase_amount + factor(gender) + age_band + est_income_code +
    factor(store_number) + factor(Cgroup),
  data = return,
  family = "binomial"
)
# The factor(BOPS):factor(Cgroup) terms come out significant.
summary(logit2)
##
## Call:
## glm(formula = return ~ factor(BOPS) * factor(Cgroup) + factor(BOPS) +
## lognet_purchase_amount + factor(gender) + age_band + est_income_code +
## factor(store_number) + factor(Cgroup), family = "binomial",
## data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3653 -0.4900 -0.4265 -0.3661 2.7120
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5800819 0.0186773 -191.681 < 2e-16 ***
## factor(BOPS)1 -0.0071149 0.0186994 -0.380 0.70358
## factor(Cgroup)2 -0.4985390 0.0094176 -52.937 < 2e-16 ***
## factor(Cgroup)3 -0.4874342 0.0100304 -48.596 < 2e-16 ***
## factor(Cgroup)4 -0.3009895 0.0126931 -23.713 < 2e-16 ***
## lognet_purchase_amount 0.3877956 0.0028052 138.241 < 2e-16 ***
## factor(gender)1 -0.3162343 0.0056024 -56.446 < 2e-16 ***
## age_band -0.0009188 0.0007068 -1.300 0.19361
## est_income_code 0.0232227 0.0012275 18.919 < 2e-16 ***
## factor(store_number)6 -0.0448871 0.0109929 -4.083 4.44e-05 ***
## factor(store_number)5998 -0.4420847 0.1568787 -2.818 0.00483 **
## factor(BOPS)1:factor(Cgroup)2 0.1561221 0.0216240 7.220 5.20e-13 ***
## factor(BOPS)1:factor(Cgroup)3 0.1843194 0.0225099 8.188 2.65e-16 ***
## factor(BOPS)1:factor(Cgroup)4 0.1802891 0.0290310 6.210 5.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 971817 on 1505960 degrees of freedom
## AIC: 971845
##
## Number of Fisher Scoring iterations: 5
# BOPS x store interaction, with the category group as a main effect.
logit3 <- glm(
  return ~ factor(BOPS) * factor(store_number) + factor(BOPS) +
    lognet_purchase_amount + factor(gender) + age_band + est_income_code +
    factor(store_number) + factor(Cgroup),
  data = return,
  family = "binomial"
)
# The factor(BOPS):factor(store_number) terms come out significant.
summary(logit3)
##
## Call:
## glm(formula = return ~ factor(BOPS) * factor(store_number) +
## factor(BOPS) + lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(store_number) + factor(Cgroup),
## family = "binomial", data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3668 -0.4898 -0.4265 -0.3663 2.8687
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.6053600 0.0184928 -194.961
## factor(BOPS)1 0.1433055 0.0072977 19.637
## factor(store_number)6 -0.0306352 0.0115736 -2.647
## factor(store_number)5998 -0.2868829 0.1617701 -1.773
## lognet_purchase_amount 0.3873806 0.0028027 138.215
## factor(gender)1 -0.3166053 0.0056023 -56.513
## age_band -0.0009501 0.0007067 -1.344
## est_income_code 0.0231677 0.0012275 18.874
## factor(Cgroup)2 -0.4696920 0.0085980 -54.628
## factor(Cgroup)3 -0.4539872 0.0092147 -49.268
## factor(Cgroup)4 -0.2674185 0.0115037 -23.246
## factor(BOPS)1:factor(store_number)6 -0.1458529 0.0369676 -3.945
## factor(BOPS)1:factor(store_number)5998 -1.5495000 0.7303080 -2.122
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(BOPS)1 < 2e-16 ***
## factor(store_number)6 0.00812 **
## factor(store_number)5998 0.07616 .
## lognet_purchase_amount < 2e-16 ***
## factor(gender)1 < 2e-16 ***
## age_band 0.17883
## est_income_code < 2e-16 ***
## factor(Cgroup)2 < 2e-16 ***
## factor(Cgroup)3 < 2e-16 ***
## factor(Cgroup)4 < 2e-16 ***
## factor(BOPS)1:factor(store_number)6 7.97e-05 ***
## factor(BOPS)1:factor(store_number)5998 0.03386 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 971868 on 1505961 degrees of freedom
## AIC: 971894
##
## Number of Fisher Scoring iterations: 5
# Exploratory: BOPS x age-band interaction.
logit4 <- glm(
  return ~ factor(BOPS) * age_band + factor(BOPS) +
    lognet_purchase_amount + factor(gender) + age_band + est_income_code +
    factor(store_number) + factor(Cgroup),
  data = return,
  family = "binomial"
)
# The factor(BOPS):age_band term is not significant at the 5% level.
summary(logit4)
##
## Call:
## glm(formula = return ~ factor(BOPS) * age_band + factor(BOPS) +
## lognet_purchase_amount + factor(gender) + age_band + est_income_code +
## factor(store_number) + factor(Cgroup), family = "binomial",
## data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3684 -0.4898 -0.4265 -0.3662 2.7082
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6015867 0.0185463 -194.195 < 2e-16 ***
## factor(BOPS)1 0.1217796 0.0114277 10.657 < 2e-16 ***
## age_band -0.0014620 0.0007714 -1.895 0.05805 .
## lognet_purchase_amount 0.3873074 0.0028027 138.190 < 2e-16 ***
## factor(gender)1 -0.3166672 0.0056026 -56.521 < 2e-16 ***
## est_income_code 0.0232088 0.0012274 18.908 < 2e-16 ***
## factor(store_number)6 -0.0453904 0.0109917 -4.130 3.64e-05 ***
## factor(store_number)5998 -0.4387998 0.1568645 -2.797 0.00515 **
## factor(Cgroup)2 -0.4694972 0.0085977 -54.607 < 2e-16 ***
## factor(Cgroup)3 -0.4539735 0.0092147 -49.266 < 2e-16 ***
## factor(Cgroup)4 -0.2673146 0.0115035 -23.238 < 2e-16 ***
## factor(BOPS)1:age_band 0.0032226 0.0018669 1.726 0.08431 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 971888 on 1505962 degrees of freedom
## AIC: 971912
##
## Number of Fisher Scoring iterations: 5
# Information-criterion comparison of logit1 and the two interaction models
# (logit2 here is the BOPS x Cgroup model; logit4 is not included).
# NOTE(review): in the tables below logit1 has the lowest AIC and BIC; the
# "#logit2" annotation presumably means logit2 is preferred among the
# interaction models only -- confirm.
AIC(logit1,logit2,logit3) #logit2
## df AIC
## logit1 26 968034.1
## logit2 14 971844.9
## logit3 13 971894.5
BIC(logit1,logit2,logit3) #logit2
## df BIC
## logit1 26 968351.9
## logit2 14 972016.1
## logit3 13 972053.4
#Endogeneity - (omitted variable: price sensitive customers)
library(AER)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:usdm':
##
## vif
## The following object is masked from 'package:VIF':
##
## vif
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:aod':
##
## rats
library(foreign)
# Correlations among the outcome, BOPS, the controls and the candidate
# instrument columns (homeowner_code, length_of_residence, child).
df <- data.frame(
  return$return,
  return$BOPS,
  return$net_purchase_amount,
  return$gender,
  return$age_band,
  return$est_income_code,
  return$homeowner_code,
  return$length_of_residence,
  return$child
)
cor(df)
## return.return return.BOPS
## return.return 1.000000000 0.01050394
## return.BOPS 0.010503940 1.00000000
## return.net_purchase_amount 0.088305830 -0.01126141
## return.gender -0.028025026 -0.03173979
## return.age_band -0.004141329 -0.03671811
## return.est_income_code 0.008205077 -0.01847097
## return.homeowner_code -0.003979457 -0.01902712
## return.length_of_residence -0.009519720 -0.01121900
## return.child -0.001182970 0.02071476
## return.net_purchase_amount return.gender
## return.return 0.0883058304 -0.02802503
## return.BOPS -0.0112614092 -0.03173979
## return.net_purchase_amount 1.0000000000 0.09756170
## return.gender 0.0975616969 1.00000000
## return.age_band 0.0003710182 0.04069056
## return.est_income_code -0.0024527297 0.06457983
## return.homeowner_code -0.0096625761 0.05083119
## return.length_of_residence -0.0206360086 0.02418361
## return.child -0.0222139305 -0.01770479
## return.age_band return.est_income_code
## return.return -0.0041413288 0.008205077
## return.BOPS -0.0367181108 -0.018470973
## return.net_purchase_amount 0.0003710182 -0.002452730
## return.gender 0.0406905558 0.064579830
## return.age_band 1.0000000000 0.170565696
## return.est_income_code 0.1705656962 1.000000000
## return.homeowner_code 0.2410726302 0.355759045
## return.length_of_residence 0.1622123028 0.160235066
## return.child 0.0175396345 0.091726525
## return.homeowner_code
## return.return -0.003979457
## return.BOPS -0.019027118
## return.net_purchase_amount -0.009662576
## return.gender 0.050831190
## return.age_band 0.241072630
## return.est_income_code 0.355759045
## return.homeowner_code 1.000000000
## return.length_of_residence 0.293794532
## return.child 0.216630218
## return.length_of_residence return.child
## return.return -0.00951972 -0.00118297
## return.BOPS -0.01121900 0.02071476
## return.net_purchase_amount -0.02063601 -0.02221393
## return.gender 0.02418361 -0.01770479
## return.age_band 0.16221230 0.01753963
## return.est_income_code 0.16023507 0.09172652
## return.homeowner_code 0.29379453 0.21663022
## return.length_of_residence 1.00000000 -0.02857371
## return.child -0.02857371 1.00000000
# Instrumental-variables regression of the return flag. factor(BOPS) appears
# only before the `|`, so it is the instrumented regressor; factor(child) and
# length_of_residence appear only after it, so they are the excluded
# instruments. All other terms appear on both sides.
# NOTE(review): the Sargan statistic recorded below is significant, which
# usually points to a problem with instrument validity -- confirm.
model1 <- ivreg(
  return ~ factor(BOPS) + lognet_purchase_amount + factor(gender) +
    age_band + est_income_code + factor(store_number) + factor(summary) +
    Month |
    factor(child) + length_of_residence + lognet_purchase_amount +
      factor(gender) + age_band + est_income_code + factor(store_number) +
      factor(summary) + Month,
  data = return
)
summary(model1, diagnostics = TRUE)
##
## Call:
## ivreg(formula = return ~ factor(BOPS) + lognet_purchase_amount +
## factor(gender) + age_band + est_income_code + factor(store_number) +
## factor(summary) + Month | factor(child) + length_of_residence +
## lognet_purchase_amount + factor(gender) + age_band + est_income_code +
## factor(store_number) + factor(summary) + Month, data = return)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46223 -0.12705 -0.08120 -0.04378 1.05382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.085e-02 1.306e-02 -5.425 5.79e-08 ***
## factor(BOPS)1 1.294e-01 3.579e-02 3.614 0.000301 ***
## lognet_purchase_amount 3.685e-02 6.917e-04 53.270 < 2e-16 ***
## factor(gender)1 -2.640e-02 6.485e-04 -40.709 < 2e-16 ***
## age_band 5.112e-04 1.336e-04 3.827 0.000130 ***
## est_income_code 2.468e-03 1.274e-04 19.377 < 2e-16 ***
## factor(store_number)6 5.868e-03 3.059e-03 1.918 0.055073 .
## factor(store_number)5998 -3.693e-02 1.307e-02 -2.826 0.004712 **
## factor(summary)2 5.605e-02 2.916e-03 19.221 < 2e-16 ***
## factor(summary)3 -3.055e-03 2.980e-03 -1.025 0.305222
## factor(summary)4 -7.183e-03 3.958e-03 -1.815 0.069514 .
## factor(summary)5 -1.925e-02 3.670e-03 -5.246 1.56e-07 ***
## factor(summary)6 2.734e-02 2.628e-03 10.402 < 2e-16 ***
## factor(summary)7 -5.088e-02 2.748e-03 -18.517 < 2e-16 ***
## factor(summary)8 1.093e-01 1.066e-02 10.247 < 2e-16 ***
## factor(summary)9 -2.010e-02 2.509e-03 -8.014 1.12e-15 ***
## factor(summary)10 1.293e-01 2.524e-02 5.121 3.03e-07 ***
## factor(summary)11 -2.072e-02 5.048e-03 -4.104 4.06e-05 ***
## factor(summary)12 -2.232e-02 2.891e-03 -7.720 1.16e-14 ***
## factor(summary)13 -2.266e-02 4.046e-03 -5.601 2.13e-08 ***
## factor(summary)14 2.990e-02 6.532e-03 4.577 4.72e-06 ***
## factor(summary)15 2.511e-02 5.906e-02 0.425 0.670739
## factor(summary)17 -3.212e-02 5.269e-03 -6.095 1.09e-09 ***
## factor(summary)20 4.802e-02 3.118e-03 15.403 < 2e-16 ***
## factor(summary)21 -1.725e-02 3.460e-03 -4.986 6.17e-07 ***
## Month -1.000e-03 7.588e-05 -13.179 < 2e-16 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 2 1505947 266.12 < 2e-16 ***
## Wu-Hausman 1 1505947 10.46 0.00122 **
## Sargan 1 NA 29.04 7.11e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3041 on 1505948 degrees of freedom
## Multiple R-Squared: 0.004632, Adjusted R-squared: 0.004615
## Wald test: 1422 on 25 and 1505948 DF, p-value: < 2.2e-16
#Marginal effects
library(mfx)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:raster':
##
## area, select
## Loading required package: betareg
# Marginal effects of the logit return model; month enters through the
# single `Month` term (one coefficient in the recorded output below).
logitmfx(
  formula = return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  data = return
)
## Call:
## logitmfx(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, data = return)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## lognet_purchase_amount 3.1182e-02 3.1585e-04 98.7253 < 2.2e-16 ***
## factor(gender)1 -2.5412e-02 4.7523e-04 -53.4726 < 2.2e-16 ***
## age_band 9.9694e-05 6.0854e-05 1.6383 0.1013677
## est_income_code 2.1026e-03 1.0529e-04 19.9705 < 2.2e-16 ***
## factor(BOPS)1 1.4278e-02 6.7014e-04 21.3056 < 2.2e-16 ***
## factor(store_number)6 -3.4321e-03 9.1613e-04 -3.7463 0.0001794 ***
## factor(store_number)5998 -2.9925e-02 9.4841e-03 -3.1553 0.0016032 **
## factor(summary)2 5.7325e-02 2.3512e-03 24.3808 < 2.2e-16 ***
## factor(summary)3 -3.0443e-03 1.8656e-03 -1.6318 0.1027287
## factor(summary)4 -1.5917e-03 1.3770e-03 -1.1559 0.2477087
## factor(summary)5 -1.1160e-02 1.3794e-03 -8.0902 5.957e-16 ***
## factor(summary)6 2.8228e-02 2.2172e-03 12.7313 < 2.2e-16 ***
## factor(summary)7 -5.1576e-02 1.2751e-03 -40.4491 < 2.2e-16 ***
## factor(summary)8 6.7113e-02 9.4938e-03 7.0691 1.559e-12 ***
## factor(summary)9 -1.0619e-02 2.1360e-03 -4.9715 6.645e-07 ***
## factor(summary)10 8.1792e-02 2.6288e-02 3.1114 0.0018621 **
## factor(summary)11 -1.4801e-02 1.6055e-03 -9.2192 < 2.2e-16 ***
## factor(summary)12 -1.1737e-02 1.3919e-03 -8.4323 < 2.2e-16 ***
## factor(summary)13 -1.2917e-02 1.7582e-03 -7.3466 2.034e-13 ***
## factor(summary)14 2.3609e-02 2.8984e-03 8.1456 3.775e-16 ***
## factor(summary)15 1.2329e-02 5.2003e-02 0.2371 0.8125985
## factor(summary)17 -2.3324e-02 2.9568e-03 -7.8883 3.062e-15 ***
## factor(summary)20 3.3880e-02 2.2702e-03 14.9237 < 2.2e-16 ***
## factor(summary)21 -1.0300e-02 1.4262e-03 -7.2214 5.144e-13 ***
## Month -1.0631e-03 5.8532e-05 -18.1633 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "factor(gender)1" "factor(BOPS)1"
## [3] "factor(store_number)6" "factor(store_number)5998"
## [5] "factor(summary)2" "factor(summary)3"
## [7] "factor(summary)4" "factor(summary)5"
## [9] "factor(summary)6" "factor(summary)7"
## [11] "factor(summary)8" "factor(summary)9"
## [13] "factor(summary)10" "factor(summary)11"
## [15] "factor(summary)12" "factor(summary)13"
## [17] "factor(summary)14" "factor(summary)15"
## [19] "factor(summary)17" "factor(summary)20"
## [21] "factor(summary)21"
#Heteroscedasticity - Yes (Breusch-Pagan rejects homoscedasticity, although Goldfeld-Quandt does not)
library(lmtest)
# Goldfeld-Quandt test on logit1. The alternative is one-sided (variance
# increases from segment 1 to segment 2); the recorded p-value of 1 gives no
# evidence for it.
# NOTE(review): logit1 is defined outside this chunk; lmtest documents gqtest
# for linear regression models, so confirm it is meaningful for this fit.
gqtest(logit1) # Goldfeld-Quandt test indicates no heteroscedasticity
##
## Goldfeld-Quandt test
##
## data: logit1
## GQ = 0.92468, df1 = 752960, df2 = 752960, p-value = 1
## alternative hypothesis: variance increases from segment 1 to 2
# Studentized Breusch-Pagan test on logit1. The recorded statistic
# (BP = 34700 on 25 df, p < 2.2e-16) rejects constant error variance.
# NOTE(review): logit1 is defined outside this chunk; lmtest documents bptest
# for linear regression models, so confirm it is meaningful for this fit.
bptest(logit1) # Breusch-Pagan test indicates heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: logit1
## BP = 34700, df = 25, p-value < 2.2e-16
#Fix
library(sandwich)
library(foreign)
# Logit marginal effects with robust = TRUE, i.e. with robust standard errors.
# Note that this call also changes the specification relative to the earlier
# logitmfx() call: month enters as factor(month) dummies instead of `Month`.
logitmfx(
  formula = return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + factor(month),
  data = return,
  robust = TRUE
)
## Call:
## logitmfx(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + factor(month), data = return, robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## lognet_purchase_amount 3.1059e-02 3.2135e-04 96.6497 < 2.2e-16 ***
## factor(gender)1 -2.4543e-02 4.7861e-04 -51.2797 < 2.2e-16 ***
## age_band 1.2059e-04 6.0568e-05 1.9910 0.0464782 *
## est_income_code 2.0158e-03 1.0509e-04 19.1824 < 2.2e-16 ***
## factor(BOPS)1 1.4514e-02 6.7253e-04 21.5810 < 2.2e-16 ***
## factor(store_number)6 -3.7439e-03 9.1023e-04 -4.1132 3.903e-05 ***
## factor(store_number)5998 -2.9911e-02 9.4870e-03 -3.1529 0.0016168 **
## factor(summary)2 5.6267e-02 2.3677e-03 23.7651 < 2.2e-16 ***
## factor(summary)3 -3.1022e-03 1.8715e-03 -1.6576 0.0973953 .
## factor(summary)4 -4.9080e-04 1.4029e-03 -0.3498 0.7264578
## factor(summary)5 -1.0256e-02 1.4102e-03 -7.2731 3.514e-13 ***
## factor(summary)6 2.8304e-02 2.2375e-03 12.6499 < 2.2e-16 ***
## factor(summary)7 -4.9457e-02 1.3357e-03 -37.0280 < 2.2e-16 ***
## factor(summary)8 6.4702e-02 9.4592e-03 6.8400 7.917e-12 ***
## factor(summary)9 -8.7872e-03 2.1929e-03 -4.0071 6.148e-05 ***
## factor(summary)10 7.6201e-02 2.6036e-02 2.9268 0.0034250 **
## factor(summary)11 -1.3640e-02 1.6307e-03 -8.3650 < 2.2e-16 ***
## factor(summary)12 -1.0767e-02 1.4173e-03 -7.5968 3.036e-14 ***
## factor(summary)13 -1.2119e-02 1.7817e-03 -6.8015 1.035e-11 ***
## factor(summary)14 2.2609e-02 2.8992e-03 7.7983 6.276e-15 ***
## factor(summary)15 1.4001e-02 5.2712e-02 0.2656 0.7905331
## factor(summary)17 -2.4238e-02 2.9607e-03 -8.1864 2.692e-16 ***
## factor(summary)20 3.3984e-02 2.2820e-03 14.8921 < 2.2e-16 ***
## factor(summary)21 -9.1960e-03 1.4591e-03 -6.3027 2.925e-10 ***
## factor(month)AUG 8.5264e-03 1.4798e-03 5.7618 8.323e-09 ***
## factor(month)DEC -1.0010e-02 1.0274e-03 -9.7436 < 2.2e-16 ***
## factor(month)FEB -3.8772e-03 1.1540e-03 -3.3597 0.0007801 ***
## factor(month)JAN 2.0431e-02 1.4301e-03 14.2865 < 2.2e-16 ***
## factor(month)JUL 1.5218e-02 1.5595e-03 9.7585 < 2.2e-16 ***
## factor(month)JUN 1.0233e-02 1.5143e-03 6.7575 1.404e-11 ***
## factor(month)MAR 7.3426e-03 1.4108e-03 5.2045 1.945e-07 ***
## factor(month)MAY -8.0777e-03 1.1786e-03 -6.8536 7.200e-12 ***
## factor(month)NOV 5.8716e-04 1.1795e-03 0.4978 0.6186103
## factor(month)OCT 1.1215e-02 1.4649e-03 7.6557 1.923e-14 ***
## factor(month)SEP 4.3963e-03 1.4601e-03 3.0109 0.0026045 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "factor(gender)1" "factor(BOPS)1"
## [3] "factor(store_number)6" "factor(store_number)5998"
## [5] "factor(summary)2" "factor(summary)3"
## [7] "factor(summary)4" "factor(summary)5"
## [9] "factor(summary)6" "factor(summary)7"
## [11] "factor(summary)8" "factor(summary)9"
## [13] "factor(summary)10" "factor(summary)11"
## [15] "factor(summary)12" "factor(summary)13"
## [17] "factor(summary)14" "factor(summary)15"
## [19] "factor(summary)17" "factor(summary)20"
## [21] "factor(summary)21" "factor(month)AUG"
## [23] "factor(month)DEC" "factor(month)FEB"
## [25] "factor(month)JAN" "factor(month)JUL"
## [27] "factor(month)JUN" "factor(month)MAR"
## [29] "factor(month)MAY" "factor(month)NOV"
## [31] "factor(month)OCT" "factor(month)SEP"
#Prediction
# Predicted return probabilities from logit2 for every row of `return`.
# predict() on a glm defaults to the link (log-odds) scale, and its data
# argument is `newdata` (a `data =` argument is silently swallowed by `...`),
# so request type = "response" explicitly before thresholding at 0.5.
# NOTE(review): logit2 is defined outside this chunk and is assumed to be a
# binomial glm. The output recorded below predates this fix (its table has
# only a FALSE column, and the accuracy equals the share of non-returns), so
# it must be regenerated.
pred <- predict(logit2, newdata = return, type = "response")
# Classify a row as a return (1) when its predicted probability is >= 0.5.
return_prediction <- ifelse(pred >= 0.5, 1, 0)
# Share of rows whose predicted class differs from the observed outcome.
misClasificError <- mean(return_prediction != return$return)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.896372712941923"
table(return$return, pred>=0.5)
##
## FALSE
## 0 1349914
## 1 156060
# Probit model
# Probit version of the return model: binomial family with a probit link,
# using the regressors of the first logitmfx() call (with the `Month` term).
probit1 <- glm(
  formula = return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  family = binomial(link = "probit"),
  data = return
)
summary(probit1)
##
## Call:
## glm(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, family = binomial(link = "probit"),
## data = return)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0982 -0.4989 -0.4294 -0.3592 2.8448
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0867002 0.0155984 -133.776 < 2e-16 ***
## lognet_purchase_amount 0.1909827 0.0019677 97.060 < 2e-16 ***
## factor(gender)1 -0.1539945 0.0029235 -52.676 < 2e-16 ***
## age_band 0.0004790 0.0003687 1.299 0.193914
## est_income_code 0.0128334 0.0006380 20.115 < 2e-16 ***
## factor(BOPS)1 0.0863040 0.0037618 22.942 < 2e-16 ***
## factor(store_number)6 -0.0219783 0.0056607 -3.883 0.000103 ***
## factor(store_number)5998 -0.2069866 0.0783826 -2.641 0.008273 **
## factor(summary)2 0.2844931 0.0104766 27.155 < 2e-16 ***
## factor(summary)3 -0.0231000 0.0125966 -1.834 0.066680 .
## factor(summary)4 -0.0261801 0.0090428 -2.895 0.003790 **
## factor(summary)5 -0.0857045 0.0094719 -9.048 < 2e-16 ***
## factor(summary)6 0.1544633 0.0116097 13.305 < 2e-16 ***
## factor(summary)7 -0.3698521 0.0133070 -27.794 < 2e-16 ***
## factor(summary)8 0.3458561 0.0413541 8.363 < 2e-16 ***
## factor(summary)9 -0.0685159 0.0140869 -4.864 1.15e-06 ***
## factor(summary)10 0.4038945 0.1070925 3.771 0.000162 ***
## factor(summary)11 -0.1125962 0.0117615 -9.573 < 2e-16 ***
## factor(summary)12 -0.0850843 0.0095242 -8.933 < 2e-16 ***
## factor(summary)13 -0.0994907 0.0125047 -7.956 1.77e-15 ***
## factor(summary)14 0.1115142 0.0150707 7.399 1.37e-13 ***
## factor(summary)15 0.0587110 0.2974834 0.197 0.843547
## factor(summary)17 -0.1582564 0.0251298 -6.298 3.02e-10 ***
## factor(summary)20 0.1855339 0.0116398 15.940 < 2e-16 ***
## factor(summary)21 -0.0771094 0.0096869 -7.960 1.72e-15 ***
## Month -0.0063122 0.0003546 -17.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1002920 on 1505973 degrees of freedom
## Residual deviance: 967932 on 1505948 degrees of freedom
## AIC: 967984
##
## Number of Fisher Scoring iterations: 5
# Overall significance of probit1 versus the intercept-only model.
# Drop in deviance from the null model to the fitted model:
probit1$null.deviance - probit1$deviance
## [1] 34987.98
# Number of parameters added relative to the null model:
probit1$df.null - probit1$df.residual
## [1] 25
# Upper-tail chi-square p-value for that deviance drop:
pchisq(
  q = probit1$null.deviance - probit1$deviance,
  df = probit1$df.null - probit1$df.residual,
  lower.tail = FALSE
)
## [1] 0
# Marginal effects for the probit specification (same formula as probit1).
probitmfx(
  formula = return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  data = return
)
## Call:
## probitmfx(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, data = return)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## lognet_purchase_amount 3.2544e-02 3.3330e-04 97.6429 < 2.2e-16 ***
## factor(gender)1 -2.6062e-02 4.9068e-04 -53.1143 < 2.2e-16 ***
## age_band 8.1619e-05 6.2828e-05 1.2991 0.193914
## est_income_code 2.1869e-03 1.0869e-04 20.1210 < 2.2e-16 ***
## factor(BOPS)1 1.5267e-02 6.8985e-04 22.1304 < 2.2e-16 ***
## factor(store_number)6 -3.6992e-03 9.4093e-04 -3.9314 8.446e-05 ***
## factor(store_number)5998 -3.0710e-02 9.9827e-03 -3.0763 0.002096 **
## factor(summary)2 5.7007e-02 2.4207e-03 23.5498 < 2.2e-16 ***
## factor(summary)3 -3.8791e-03 2.0842e-03 -1.8612 0.062720 .
## factor(summary)4 -4.4059e-03 1.5029e-03 -2.9316 0.003373 **
## factor(summary)5 -1.4097e-02 1.5030e-03 -9.3794 < 2.2e-16 ***
## factor(summary)6 2.8909e-02 2.3710e-03 12.1926 < 2.2e-16 ***
## factor(summary)7 -5.0598e-02 1.4086e-03 -35.9201 < 2.2e-16 ***
## factor(summary)8 7.2830e-02 1.0417e-02 6.9917 2.715e-12 ***
## factor(summary)9 -1.1187e-02 2.2011e-03 -5.0826 3.722e-07 ***
## factor(summary)10 8.7879e-02 2.8482e-02 3.0854 0.002033 **
## factor(summary)11 -1.7878e-02 1.7342e-03 -10.3093 < 2.2e-16 ***
## factor(summary)12 -1.4042e-02 1.5216e-03 -9.2280 < 2.2e-16 ***
## factor(summary)13 -1.5922e-02 1.8743e-03 -8.4948 < 2.2e-16 ***
## factor(summary)14 2.0373e-02 2.9412e-03 6.9267 4.308e-12 ***
## factor(summary)15 1.0392e-02 5.4633e-02 0.1902 0.849146
## factor(summary)17 -2.4283e-02 3.4443e-03 -7.0504 1.784e-12 ***
## factor(summary)20 3.5401e-02 2.4639e-03 14.3679 < 2.2e-16 ***
## factor(summary)21 -1.2740e-02 1.5512e-03 -8.2133 < 2.2e-16 ***
## Month -1.0756e-03 6.0414e-05 -17.8040 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "factor(gender)1" "factor(BOPS)1"
## [3] "factor(store_number)6" "factor(store_number)5998"
## [5] "factor(summary)2" "factor(summary)3"
## [7] "factor(summary)4" "factor(summary)5"
## [9] "factor(summary)6" "factor(summary)7"
## [11] "factor(summary)8" "factor(summary)9"
## [13] "factor(summary)10" "factor(summary)11"
## [15] "factor(summary)12" "factor(summary)13"
## [17] "factor(summary)14" "factor(summary)15"
## [19] "factor(summary)17" "factor(summary)20"
## [21] "factor(summary)21"
# Same probit marginal effects, now with robust = TRUE for robust standard
# errors; the formula is unchanged from the non-robust call.
probitmfx(
  formula = return ~ lognet_purchase_amount + factor(gender) + age_band +
    est_income_code + factor(BOPS) + factor(store_number) +
    factor(summary) + Month,
  data = return,
  robust = TRUE
)
## Call:
## probitmfx(formula = return ~ lognet_purchase_amount + factor(gender) +
## age_band + est_income_code + factor(BOPS) + factor(store_number) +
## factor(summary) + Month, data = return, robust = TRUE)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## lognet_purchase_amount 3.2544e-02 3.4156e-04 95.2822 < 2.2e-16 ***
## factor(gender)1 -2.6062e-02 4.9488e-04 -52.6634 < 2.2e-16 ***
## age_band 8.1619e-05 6.2714e-05 1.3014 0.193106
## est_income_code 2.1869e-03 1.0876e-04 20.1078 < 2.2e-16 ***
## factor(BOPS)1 1.5267e-02 6.9166e-04 22.0724 < 2.2e-16 ***
## factor(store_number)6 -3.6992e-03 9.3854e-04 -3.9414 8.101e-05 ***
## factor(store_number)5998 -3.0710e-02 1.0110e-02 -3.0377 0.002384 **
## factor(summary)2 5.7007e-02 2.4499e-03 23.2691 < 2.2e-16 ***
## factor(summary)3 -3.8791e-03 2.0951e-03 -1.8515 0.064096 .
## factor(summary)4 -4.4059e-03 1.5197e-03 -2.8991 0.003742 **
## factor(summary)5 -1.4097e-02 1.5283e-03 -9.2240 < 2.2e-16 ***
## factor(summary)6 2.8909e-02 2.3947e-03 12.0721 < 2.2e-16 ***
## factor(summary)7 -5.0598e-02 1.4348e-03 -35.2655 < 2.2e-16 ***
## factor(summary)8 7.2830e-02 1.0475e-02 6.9530 3.575e-12 ***
## factor(summary)9 -1.1187e-02 2.2259e-03 -5.0259 5.010e-07 ***
## factor(summary)10 8.7879e-02 2.8740e-02 3.0577 0.002230 **
## factor(summary)11 -1.7878e-02 1.7428e-03 -10.2584 < 2.2e-16 ***
## factor(summary)12 -1.4042e-02 1.5410e-03 -9.1123 < 2.2e-16 ***
## factor(summary)13 -1.5922e-02 1.8875e-03 -8.4355 < 2.2e-16 ***
## factor(summary)14 2.0373e-02 2.9565e-03 6.8908 5.549e-12 ***
## factor(summary)15 1.0392e-02 5.4477e-02 0.1908 0.848720
## factor(summary)17 -2.4283e-02 3.5022e-03 -6.9337 4.100e-12 ***
## factor(summary)20 3.5401e-02 2.4743e-03 14.3078 < 2.2e-16 ***
## factor(summary)21 -1.2740e-02 1.5754e-03 -8.0870 6.116e-16 ***
## Month -1.0756e-03 5.9904e-05 -17.9558 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "factor(gender)1" "factor(BOPS)1"
## [3] "factor(store_number)6" "factor(store_number)5998"
## [5] "factor(summary)2" "factor(summary)3"
## [7] "factor(summary)4" "factor(summary)5"
## [9] "factor(summary)6" "factor(summary)7"
## [11] "factor(summary)8" "factor(summary)9"
## [13] "factor(summary)10" "factor(summary)11"
## [15] "factor(summary)12" "factor(summary)13"
## [17] "factor(summary)14" "factor(summary)15"
## [19] "factor(summary)17" "factor(summary)20"
## [21] "factor(summary)21"
# Compare the logit and probit fits by information criteria (lower is better).
# Both have 26 parameters; in the recorded output probit1 is lower on AIC.
AIC(logit1,probit1)
## df AIC
## logit1 26 968034.1
## probit1 26 967984.3
# Same comparison with BIC; the recorded output again favours probit1.
BIC(logit1,probit1)
## df BIC
## logit1 26 968351.9
## probit1 26 968302.2
#read the data
# Load the monthly sales panel and summarise its first nine columns.
newdata <- read.csv(
  file = "new_project_monthlysales_Final.csv",
  header = TRUE
)
summary(newdata[, seq_len(9)])
## store_number summary month_index monthly_sales
## Min. : 2 Min. : 1.000 Min. :13.0 Min. : 19
## 1st Qu.: 2 1st Qu.: 4.000 1st Qu.:19.0 1st Qu.: 9746
## Median : 6 Median : 9.000 Median :25.0 Median : 29009
## Mean :1677 Mean : 9.356 Mean :25.4 Mean : 166785
## 3rd Qu.:5998 3rd Qu.:13.000 3rd Qu.:31.0 3rd Qu.: 174640
## Max. :5998 Max. :21.000 Max. :37.0 Max. :4727543
##
## pol_change month year treatment
## Min. :0.0000 AUG :122 Min. :2010 Min. :0.0000
## 1st Qu.:0.0000 DEC : 92 1st Qu.:2011 1st Qu.:0.0000
## Median :1.0000 JAN : 90 Median :2011 Median :1.0000
## Mean :0.5399 JUL : 90 Mean :2011 Mean :0.7208
## 3rd Qu.:1.0000 JUN : 90 3rd Qu.:2012 3rd Qu.:1.0000
## Max. :1.0000 FEB : 89 Max. :2012 Max. :1.0000
## (Other):505
## bridal
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2004
## 3rd Qu.:0.0000
## Max. :1.0000
##
#define all variables
# Raw monthly sales as a free-standing vector, plus its histogram.
monthly_sales <- newdata[["monthly_sales"]]
hist(monthly_sales)
Please refer to the Monthly Sales Histogram.
# Natural log of monthly sales; this is the response used by the lm() models.
log_sales <- log(monthly_sales)
# Author's reading of this histogram: approximately normal.
hist(log_sales)
Please refer to the Log of Monthly Sales Histogram.
# Regressors pulled out of `newdata` as free-standing vectors; the lm() calls
# below refer to these names directly rather than to columns of a data frame.
store <- as.factor(newdata[["store_number"]])
category <- as.factor(newdata[["summary"]])
time_index <- newdata[["month_index"]]
# Note: this rebinds `BOPS`, which held the data frame read at the top of the
# file, to the policy-change factor.
BOPS <- as.factor(newdata[["pol_change"]])
month <- as.factor(newdata[["month"]])
year <- as.factor(newdata[["year"]])
treatment <- as.factor(newdata[["treatment"]])
bridal <- as.factor(newdata[["bridal"]])

# 0/1 indicators for November and December, stored on the data frame and
# also exposed as factors for the models.
newdata[["Nov"]] <- as.numeric(newdata[["month"]] == "NOV")
newdata[["Dec"]] <- as.numeric(newdata[["month"]] == "DEC")
Nov <- as.factor(newdata[["Nov"]])
Dec <- as.factor(newdata[["Dec"]])

summary(newdata)
## store_number summary month_index monthly_sales
## Min. : 2 Min. : 1.000 Min. :13.0 Min. : 19
## 1st Qu.: 2 1st Qu.: 4.000 1st Qu.:19.0 1st Qu.: 9746
## Median : 6 Median : 9.000 Median :25.0 Median : 29009
## Mean :1677 Mean : 9.356 Mean :25.4 Mean : 166785
## 3rd Qu.:5998 3rd Qu.:13.000 3rd Qu.:31.0 3rd Qu.: 174640
## Max. :5998 Max. :21.000 Max. :37.0 Max. :4727543
##
## pol_change month year treatment
## Min. :0.0000 AUG :122 Min. :2010 Min. :0.0000
## 1st Qu.:0.0000 DEC : 92 1st Qu.:2011 1st Qu.:0.0000
## Median :1.0000 JAN : 90 Median :2011 Median :1.0000
## Mean :0.5399 JUL : 90 Mean :2011 Mean :0.7208
## 3rd Qu.:1.0000 JUN : 90 3rd Qu.:2012 3rd Qu.:1.0000
## Max. :1.0000 FEB : 89 Max. :2012 Max. :1.0000
## (Other):505
## bridal Nov Dec
## Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.2004 Mean :0.08163 Mean :0.08534
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
##
#check factor variable correlation
#predictability matrix
library(GoodmanKruskal)
# Goodman-Kruskal tau association matrix for the categorical columns
# (columns 1, 2, 5, 6, 7, 8 and 9 of `newdata`, selected here by name).
GK_correlation_matrix <- GKtauDataframe(
  newdata[, c("store_number", "summary", "pol_change", "month", "year",
              "treatment", "bridal")]
)
plot(
  GK_correlation_matrix,
  diagSize = 0.8,
  diagColor = "black",
  backgroundColor = "white",
  colorPlot = FALSE
)
Please refer to the Correlation Chart.
# OLS model
# Full specification: all eight regressors. The recorded output reports three
# coefficients as not defined because of singularities.
model1 <- lm(
  formula = log_sales ~ store + category + time_index + BOPS + month + year +
    treatment + bridal
)
summary(model1)
##
## Call:
## lm(formula = log_sales ~ store + category + time_index + BOPS +
## month + year + treatment + bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9619 -0.2468 0.0341 0.3215 3.3827
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.815873 0.293905 43.605 < 2e-16 ***
## store6 -2.578039 0.058301 -44.219 < 2e-16 ***
## store5998 -3.122738 0.061490 -50.785 < 2e-16 ***
## category2 -1.134516 0.129422 -8.766 < 2e-16 ***
## category3 -0.243113 0.129422 -1.878 0.0606 .
## category4 0.688088 0.129422 5.317 1.29e-07 ***
## category5 0.188185 0.129422 1.454 0.1462
## category6 -1.023648 0.129422 -7.909 6.56e-15 ***
## category7 -2.725687 0.129422 -21.061 < 2e-16 ***
## category8 -4.061074 0.168504 -24.101 < 2e-16 ***
## category9 -4.091243 0.137257 -29.807 < 2e-16 ***
## category10 -5.505318 0.199320 -27.621 < 2e-16 ***
## category11 -0.668600 0.129422 -5.166 2.86e-07 ***
## category12 0.239848 0.129422 1.853 0.0641 .
## category13 -1.129676 0.129422 -8.729 < 2e-16 ***
## category14 -3.013735 0.130850 -23.032 < 2e-16 ***
## category15 -6.844201 0.254349 -26.909 < 2e-16 ***
## category17 -0.789919 0.183319 -4.309 1.79e-05 ***
## category20 -0.748230 0.129422 -5.781 9.78e-09 ***
## category21 -0.115835 0.129422 -0.895 0.3710
## time_index 0.001325 0.012451 0.106 0.9153
## BOPS1 0.172373 0.161582 1.067 0.2863
## monthAUG 0.130159 0.119283 1.091 0.2754
## monthDEC 1.292728 0.126439 10.224 < 2e-16 ***
## monthFEB 0.563720 0.119724 4.709 2.83e-06 ***
## monthJAN 0.214273 0.122653 1.747 0.0809 .
## monthJUL 0.113400 0.122614 0.925 0.3553
## monthJUN 0.004471 0.119447 0.037 0.9701
## monthMAR 0.056186 0.118179 0.475 0.6346
## monthMAY 0.496446 0.118130 4.203 2.87e-05 ***
## monthNOV 0.646009 0.132919 4.860 1.35e-06 ***
## monthOCT 0.062955 0.142703 0.441 0.6592
## monthSEP 0.011479 0.149530 0.077 0.9388
## year2011 -0.059301 0.049941 -1.187 0.2353
## year2012 NA NA NA NA
## treatment1 NA NA NA NA
## bridal1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7765 on 1044 degrees of freedom
## Multiple R-squared: 0.8669, Adjusted R-squared: 0.8627
## F-statistic: 206.1 on 33 and 1044 DF, p-value: < 2.2e-16
# Basic model: model1 without time_index, treatment and bridal.
model11 <- lm(
  formula = log_sales ~ store + category + BOPS + month + year
)
summary(model11)
##
## Call:
## lm(formula = log_sales ~ store + category + BOPS + month + year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9619 -0.2468 0.0341 0.3215 3.3827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.827797 0.199994 64.141 < 2e-16 ***
## store6 -2.578039 0.058301 -44.219 < 2e-16 ***
## store5998 -3.122738 0.061490 -50.785 < 2e-16 ***
## category2 -1.134516 0.129422 -8.766 < 2e-16 ***
## category3 -0.243113 0.129422 -1.878 0.060597 .
## category4 0.688088 0.129422 5.317 1.29e-07 ***
## category5 0.188185 0.129422 1.454 0.146235
## category6 -1.023648 0.129422 -7.909 6.56e-15 ***
## category7 -2.725687 0.129422 -21.061 < 2e-16 ***
## category8 -4.061074 0.168504 -24.101 < 2e-16 ***
## category9 -4.091243 0.137257 -29.807 < 2e-16 ***
## category10 -5.505318 0.199320 -27.621 < 2e-16 ***
## category11 -0.668600 0.129422 -5.166 2.86e-07 ***
## category12 0.239848 0.129422 1.853 0.064132 .
## category13 -1.129676 0.129422 -8.729 < 2e-16 ***
## category14 -3.013735 0.130850 -23.032 < 2e-16 ***
## category15 -6.844201 0.254349 -26.909 < 2e-16 ***
## category17 -0.789919 0.183319 -4.309 1.79e-05 ***
## category20 -0.748230 0.129422 -5.781 9.78e-09 ***
## category21 -0.115835 0.129422 -0.895 0.370984
## BOPS1 0.172373 0.161582 1.067 0.286314
## monthAUG 0.135458 0.146068 0.927 0.353953
## monthDEC 1.303327 0.189302 6.885 9.96e-12 ***
## monthFEB 0.561070 0.117111 4.791 1.90e-06 ***
## monthJAN 0.210298 0.116821 1.800 0.072122 .
## monthJUL 0.117375 0.116800 1.005 0.315169
## monthJUN 0.007121 0.116800 0.061 0.951397
## monthMAR 0.054861 0.117503 0.467 0.640676
## monthMAY 0.497770 0.117477 4.237 2.46e-05 ***
## monthNOV 0.655282 0.190061 3.448 0.000588 ***
## monthOCT 0.070904 0.192351 0.369 0.712487
## monthSEP 0.018103 0.192359 0.094 0.925041
## year2011 -0.043403 0.165554 -0.262 0.793245
## year2012 0.031796 0.298813 0.106 0.915279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7765 on 1044 degrees of freedom
## Multiple R-squared: 0.8669, Adjusted R-squared: 0.8627
## F-statistic: 206.1 on 33 and 1044 DF, p-value: < 2.2e-16
# Basic model with Nov/Dec dummies in place of the full month factor, and
# treatment in place of store.
model12 <- lm(
  formula = log_sales ~ category + BOPS + Nov + Dec + year + treatment
)
summary(model12)
##
## Call:
## lm(formula = log_sales ~ category + BOPS + Nov + Dec + year +
## treatment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1357 -1.0973 -0.0347 1.1656 3.4876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.77936 0.20772 47.081 < 2e-16 ***
## category2 -1.13452 0.22012 -5.154 3.04e-07 ***
## category3 -0.24311 0.22012 -1.104 0.269654
## category4 0.68809 0.22012 3.126 0.001821 **
## category5 0.18818 0.22012 0.855 0.392797
## category6 -1.02365 0.22012 -4.650 3.73e-06 ***
## category7 -2.72569 0.22012 -12.383 < 2e-16 ***
## category8 -3.35476 0.28514 -11.765 < 2e-16 ***
## category9 -3.86316 0.23325 -16.562 < 2e-16 ***
## category10 -4.21753 0.33525 -12.580 < 2e-16 ***
## category11 -0.66860 0.22012 -3.037 0.002445 **
## category12 0.23985 0.22012 1.090 0.276134
## category13 -1.12968 0.22012 -5.132 3.41e-07 ***
## category14 -3.01854 0.22253 -13.564 < 2e-16 ***
## category15 -5.59003 0.42941 -13.018 < 2e-16 ***
## category17 0.49305 0.30786 1.602 0.109559
## category20 -0.74823 0.22012 -3.399 0.000701 ***
## category21 -0.11583 0.22012 -0.526 0.598841
## BOPS1 0.01588 0.12668 0.125 0.900242
## Nov1 0.59388 0.16646 3.568 0.000376 ***
## Dec1 1.20044 0.16380 7.329 4.62e-13 ***
## year2011 0.09883 0.14027 0.705 0.481251
## year2012 0.27878 0.20500 1.360 0.174143
## treatment1 1.85194 0.09244 20.034 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.321 on 1054 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.6028
## F-statistic: 72.06 on 23 and 1054 DF, p-value: < 2.2e-16
# Basic model with store effects and a BOPS-by-store interaction.
model13 <- lm(
  formula = log_sales ~ category + BOPS + Nov + Dec + year + store +
    store * BOPS
)
summary(model13)
##
## Call:
## lm(formula = log_sales ~ category + BOPS + Nov + Dec + year +
## store + store * BOPS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2907 -0.2843 0.0451 0.3297 3.5135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.95076 0.11977 108.127 < 2e-16 ***
## category2 -1.13452 0.12995 -8.731 < 2e-16 ***
## category3 -0.24311 0.12995 -1.871 0.06164 .
## category4 0.68809 0.12995 5.295 1.45e-07 ***
## category5 0.18818 0.12995 1.448 0.14787
## category6 -1.02365 0.12995 -7.877 8.30e-15 ***
## category7 -2.72569 0.12995 -20.975 < 2e-16 ***
## category8 -4.05692 0.16913 -23.987 < 2e-16 ***
## category9 -4.09888 0.13785 -29.735 < 2e-16 ***
## category10 -5.51965 0.20017 -27.575 < 2e-16 ***
## category11 -0.66860 0.12995 -5.145 3.19e-07 ***
## category12 0.23985 0.12995 1.846 0.06521 .
## category13 -1.12968 0.12995 -8.693 < 2e-16 ***
## category14 -3.00942 0.13138 -22.907 < 2e-16 ***
## category15 -6.91177 0.25558 -27.044 < 2e-16 ***
## category17 -0.79015 0.18406 -4.293 1.93e-05 ***
## category20 -0.74823 0.12995 -5.758 1.12e-08 ***
## category21 -0.11583 0.12995 -0.891 0.37292
## BOPS1 -0.20267 0.09675 -2.095 0.03643 *
## Nov1 0.62691 0.09863 6.356 3.08e-10 ***
## Dec1 1.27880 0.09712 13.167 < 2e-16 ***
## year2011 0.16999 0.08404 2.023 0.04337 *
## year2012 0.38600 0.12233 3.155 0.00165 **
## store6 -2.61645 0.08366 -31.276 < 2e-16 ***
## store5998 -3.50583 0.09268 -37.827 < 2e-16 ***
## BOPS1:store6 0.07644 0.11301 0.676 0.49891
## BOPS1:store5998 0.65552 0.12010 5.458 6.00e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7797 on 1051 degrees of freedom
## Multiple R-squared: 0.8649, Adjusted R-squared: 0.8616
## F-statistic: 258.8 on 26 and 1051 DF, p-value: < 2.2e-16
# model1 with the store factor removed.
model2 <- lm(
  formula = log_sales ~ category + time_index + BOPS + month + year +
    treatment + bridal
)
summary(model2)
##
## Call:
## lm(formula = log_sales ~ category + time_index + BOPS + month +
## year + treatment + bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9523 -1.1035 -0.0167 1.1956 3.1950
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.653241 0.503829 19.160 < 2e-16 ***
## category2 -1.134516 0.219261 -5.174 2.74e-07 ***
## category3 -0.243113 0.219261 -1.109 0.267779
## category4 0.688088 0.219261 3.138 0.001747 **
## category5 0.188185 0.219261 0.858 0.390942
## category6 -1.023648 0.219261 -4.669 3.43e-06 ***
## category7 -2.725687 0.219261 -12.431 < 2e-16 ***
## category8 -3.357232 0.284196 -11.813 < 2e-16 ***
## category9 -3.864169 0.232372 -16.629 < 2e-16 ***
## category10 -4.226614 0.334108 -12.650 < 2e-16 ***
## category11 -0.668600 0.219261 -3.049 0.002351 **
## category12 0.239848 0.219261 1.094 0.274255
## category13 -1.129676 0.219261 -5.152 3.08e-07 ***
## category14 -3.015184 0.221681 -13.601 < 2e-16 ***
## category15 -5.574060 0.428152 -13.019 < 2e-16 ***
## category17 0.493225 0.306656 1.608 0.108050
## category20 -0.748230 0.219261 -3.413 0.000668 ***
## category21 -0.115835 0.219261 -0.528 0.597407
## time_index 0.003542 0.021093 0.168 0.866662
## BOPS1 0.115883 0.273737 0.423 0.672136
## monthAUG 0.130686 0.202084 0.647 0.517973
## monthDEC 1.266980 0.214206 5.915 4.50e-09 ***
## monthFEB 0.571899 0.202831 2.820 0.004899 **
## monthJAN 0.202547 0.207793 0.975 0.329909
## monthJUL 0.088466 0.207726 0.426 0.670285
## monthJUN -0.018745 0.202360 -0.093 0.926216
## monthMAR 0.042668 0.200213 0.213 0.831282
## monthMAY 0.478081 0.200130 2.389 0.017078 *
## monthNOV 0.663810 0.225186 2.948 0.003271 **
## monthOCT 0.065239 0.241762 0.270 0.787330
## monthSEP 0.035723 0.253326 0.141 0.887885
## year2011 -0.044227 0.084606 -0.523 0.601272
## year2012 NA NA NA NA
## treatment1 1.851325 0.092084 20.105 < 2e-16 ***
## bridal1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.316 on 1045 degrees of freedom
## Multiple R-squared: 0.6176, Adjusted R-squared: 0.6059
## F-statistic: 52.74 on 32 and 1045 DF, p-value: < 2.2e-16
# model2 with the category factor removed.
model3 <- lm(
  formula = log_sales ~ time_index + BOPS + month + year + treatment + bridal
)
summary(model3)
##
## Call:
## lm(formula = log_sales ~ time_index + BOPS + month + year + treatment +
## bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6240 -1.0491 0.2087 1.3512 3.7637
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.661921 0.709119 12.215 < 2e-16 ***
## time_index 0.005163 0.031045 0.166 0.867937
## BOPS1 0.113305 0.402785 0.281 0.778532
## monthAUG 0.031862 0.297325 0.107 0.914680
## monthDEC 1.131156 0.315150 3.589 0.000347 ***
## monthFEB 0.506438 0.298469 1.697 0.090031 .
## monthJAN 0.114822 0.305719 0.376 0.707304
## monthJUL 0.002271 0.305623 0.007 0.994073
## monthJUN -0.102867 0.297729 -0.346 0.729784
## monthMAR 0.032296 0.294474 0.110 0.912688
## monthMAY 0.476442 0.294392 1.618 0.105876
## monthNOV 0.646705 0.331448 1.951 0.051302 .
## monthOCT -0.043410 0.355763 -0.122 0.902906
## monthSEP -0.093603 0.372759 -0.251 0.801779
## year2011 -0.068047 0.124368 -0.547 0.584395
## year2012 NA NA NA NA
## treatment1 1.652921 0.133342 12.396 < 2e-16 ***
## bridal1 0.541769 0.147412 3.675 0.000250 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.936 on 1061 degrees of freedom
## Multiple R-squared: 0.1588, Adjusted R-squared: 0.1461
## F-statistic: 12.52 on 16 and 1061 DF, p-value: < 2.2e-16
# model3 with time_index removed.
model4 <- lm(
  formula = log_sales ~ BOPS + month + year + treatment + bridal
)
summary(model4)
##
## Call:
## lm(formula = log_sales ~ BOPS + month + year + treatment + bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6240 -1.0491 0.2087 1.3512 3.7637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.708391 0.459113 18.968 < 2e-16 ***
## BOPS1 0.113305 0.402785 0.281 0.77853
## monthAUG 0.052516 0.364142 0.144 0.88536
## monthDEC 1.172463 0.471969 2.484 0.01314 *
## monthFEB 0.496112 0.291949 1.699 0.08955 .
## monthJAN 0.099332 0.291151 0.341 0.73304
## monthJUL 0.017761 0.291147 0.061 0.95137
## monthJUN -0.092540 0.291147 -0.318 0.75066
## monthMAR 0.027133 0.292785 0.093 0.92618
## monthMAY 0.481605 0.292768 1.645 0.10026
## monthNOV 0.682849 0.473938 1.441 0.14994
## monthOCT -0.012430 0.479579 -0.026 0.97933
## monthSEP -0.067786 0.479586 -0.141 0.88763
## year2011 -0.006087 0.412773 -0.015 0.98824
## year2012 0.123921 0.745075 0.166 0.86794
## treatment1 1.652921 0.133342 12.396 < 2e-16 ***
## bridal1 0.541769 0.147412 3.675 0.00025 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.936 on 1061 degrees of freedom
## Multiple R-squared: 0.1588, Adjusted R-squared: 0.1461
## F-statistic: 12.52 on 16 and 1061 DF, p-value: < 2.2e-16
# BOPS-by-treatment interaction model with category and without bridal.
model5 <- lm(
  formula = log_sales ~ BOPS + category + Nov + Dec + year + treatment +
    BOPS * treatment
)
summary(model5)
##
## Call:
## lm(formula = log_sales ~ BOPS + category + Nov + Dec + year +
## treatment + BOPS * treatment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2474 -1.0878 0.0203 1.1665 3.4763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4249 0.2291 41.143 < 2e-16 ***
## BOPS1 0.4514 0.1751 2.577 0.010098 *
## category2 -1.1345 0.2189 -5.183 2.62e-07 ***
## category3 -0.2431 0.2189 -1.111 0.266989
## category4 0.6881 0.2189 3.143 0.001717 **
## category5 0.1882 0.2189 0.860 0.390159
## category6 -1.0236 0.2189 -4.676 3.30e-06 ***
## category7 -2.7257 0.2189 -12.452 < 2e-16 ***
## category8 -3.3528 0.2836 -11.824 < 2e-16 ***
## category9 -3.8701 0.2320 -16.684 < 2e-16 ***
## category10 -4.2378 0.3334 -12.709 < 2e-16 ***
## category11 -0.6686 0.2189 -3.054 0.002312 **
## category12 0.2398 0.2189 1.096 0.273462
## category13 -1.1297 0.2189 -5.161 2.94e-07 ***
## category14 -3.0104 0.2213 -13.603 < 2e-16 ***
## category15 -5.6341 0.4272 -13.188 < 2e-16 ***
## category17 0.4922 0.3062 1.608 0.108181
## category20 -0.7482 0.2189 -3.418 0.000655 ***
## category21 -0.1158 0.2189 -0.529 0.596802
## Nov1 0.6439 0.1661 3.876 0.000113 ***
## Dec1 1.2544 0.1636 7.668 3.96e-14 ***
## year2011 0.1850 0.1416 1.307 0.191452
## year2012 0.3854 0.2060 1.871 0.061671 .
## treatment1 2.2388 0.1419 15.778 < 2e-16 ***
## BOPS1:treatment1 -0.6574 0.1837 -3.579 0.000361 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.313 on 1053 degrees of freedom
## Multiple R-squared: 0.616, Adjusted R-squared: 0.6072
## F-statistic: 70.37 on 24 and 1053 DF, p-value: < 2.2e-16
# Model 5.1: BOPS x treatment interaction, fitted with bridal.
# Uses the Nov and Dec terms plus year as time controls; no category term.
# `BOPS * treatment` already expands to BOPS + treatment + BOPS:treatment, so
# the main effects listed separately are redundant but harmless.
model51 <- lm(
  formula = log_sales ~ BOPS + Nov + Dec + year + treatment +
    BOPS * treatment + bridal
)
summary(model51)
##
## Call:
## lm(formula = log_sales ~ BOPS + Nov + Dec + year + treatment +
## BOPS * treatment + bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8708 -1.0826 0.2006 1.3443 3.8981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.3947 0.2580 32.539 < 2e-16 ***
## BOPS1 0.3297 0.2576 1.280 0.200824
## Nov1 0.7354 0.2444 3.009 0.002683 **
## Dec1 1.2281 0.2407 5.102 3.97e-07 ***
## year2011 0.2240 0.2082 1.076 0.282268
## year2012 0.5158 0.3030 1.702 0.089046 .
## treatment1 1.9834 0.2070 9.581 < 2e-16 ***
## bridal1 0.5429 0.1471 3.690 0.000236 ***
## BOPS1:treatment1 -0.5615 0.2701 -2.079 0.037883 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.933 on 1069 degrees of freedom
## Multiple R-squared: 0.1556, Adjusted R-squared: 0.1492
## F-statistic: 24.62 on 8 and 1069 DF, p-value: < 2.2e-16
# Model 5.2 -- Final Model: BOPS x treatment interaction, fitted with bridal.
# Uses the full month factor plus year as time controls.
# `BOPS * treatment` already expands to BOPS + treatment + BOPS:treatment, so
# the main effects listed separately are redundant but harmless.
model52 <- lm(
  formula = log_sales ~ BOPS + month + year + treatment +
    BOPS * treatment + bridal
)
summary(model52)
##
## Call:
## lm(formula = log_sales ~ BOPS + month + year + treatment + BOPS *
## treatment + bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7208 -1.0559 0.2125 1.3450 3.7712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.402221 0.481372 17.455 < 2e-16 ***
## BOPS1 0.487720 0.440480 1.107 0.26844
## monthAUG 0.055157 0.363572 0.152 0.87944
## monthDEC 1.220979 0.471803 2.588 0.00979 **
## monthFEB 0.502194 0.291505 1.723 0.08522 .
## monthJAN 0.106181 0.290712 0.365 0.71500
## monthJUL 0.022026 0.290697 0.076 0.93962
## monthJUN -0.090184 0.290692 -0.310 0.75644
## monthMAR 0.032582 0.292337 0.111 0.91128
## monthMAY 0.482458 0.292308 1.651 0.09913 .
## monthNOV 0.728282 0.473696 1.537 0.12448
## monthOCT -0.009788 0.478828 -0.020 0.98369
## monthSEP -0.066651 0.478833 -0.139 0.88932
## year2011 0.066419 0.413592 0.161 0.87245
## year2012 0.213080 0.745135 0.286 0.77496
## treatment1 1.983370 0.207077 9.578 < 2e-16 ***
## bridal1 0.542434 0.147181 3.685 0.00024 ***
## BOPS1:treatment1 -0.562966 0.270212 -2.083 0.03745 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.933 on 1060 degrees of freedom
## Multiple R-squared: 0.1623, Adjusted R-squared: 0.1488
## F-statistic: 12.08 on 17 and 1060 DF, p-value: < 2.2e-16
# Model 6.1 -- Final Model: three-way BOPS x treatment x bridal interaction.
# `BOPS * treatment * bridal` expands to the three main effects, every two-way
# interaction and the three-way term, so the main effects listed separately
# are redundant but harmless.
model61 <- lm(
  formula = log_sales ~ BOPS + month + year + treatment + bridal +
    BOPS * treatment * bridal
)
summary(model61)
##
## Call:
## lm(formula = log_sales ~ BOPS + month + year + treatment + bridal +
## BOPS * treatment * bridal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6892 -1.0952 0.2972 1.2684 3.8130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.512185 0.489114 17.403 < 2e-16 ***
## BOPS1 0.468095 0.456536 1.025 0.30545
## monthAUG 0.055120 0.363416 0.152 0.87947
## monthDEC 1.220662 0.471599 2.588 0.00978 **
## monthFEB 0.502913 0.291384 1.726 0.08465 .
## monthJAN 0.107359 0.290592 0.369 0.71187
## monthJUL 0.021753 0.290574 0.075 0.94034
## monthJUN -0.090575 0.290567 -0.312 0.75532
## monthMAR 0.034665 0.292216 0.119 0.90559
## monthMAY 0.482937 0.292182 1.653 0.09866 .
## monthNOV 0.727909 0.473491 1.537 0.12451
## monthOCT -0.009826 0.478621 -0.021 0.98362
## monthSEP -0.066833 0.478625 -0.140 0.88897
## year2011 0.065239 0.413413 0.158 0.87464
## year2012 0.211379 0.744813 0.284 0.77662
## treatment1 1.842996 0.232210 7.937 5.27e-15 ***
## bridal1 0.048181 0.421483 0.114 0.90901
## BOPS1:treatment1 -0.553304 0.303457 -1.823 0.06854 .
## BOPS1:bridal1 0.085583 0.547811 0.156 0.87588
## treatment1:bridal1 0.654586 0.491818 1.331 0.18349
## BOPS1:treatment1:bridal1 -0.030744 0.650940 -0.047 0.96234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.933 on 1057 degrees of freedom
## Multiple R-squared: 0.1654, Adjusted R-squared: 0.1496
## F-statistic: 10.47 on 20 and 1057 DF, p-value: < 2.2e-16