-
Notifications
You must be signed in to change notification settings - Fork 0
/
Simulation_functions.py
286 lines (247 loc) · 15.9 KB
/
Simulation_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#### ALL PYTHON FUNCTIONS ARE DEFINED IN THIS SCRIPT ####
# Import packages
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
#### Experiment for simulating power while comparing two algorithms ####
## Wilcoxon signed-rank test and Paired student t-test ##
## beta distribution and normal distribution ##
def wilcoxonPower(df, alpha = 0.05, power_threshold = 0.8, testrepeat = 100, simulationrepeat = 10, maxdatasetsize = 150):
    """Simulate statistical power for every pairwise algorithm comparison.

    For each unordered pair of distinct algorithms (rows of ``df``), samples
    are repeatedly drawn from a normal distribution (columns ``mean``/``sd``)
    and a beta distribution (columns ``alpha``/``beta``) at every sample size
    from 2 up to ``maxdatasetsize``.  Each pair of samples is compared with the
    Wilcoxon signed-rank test and the paired t-test; power is estimated as
    1 - (fraction of p-values above ``alpha``), i.e. one minus the
    false-negative rate, because the two algorithms are assumed to truly
    differ.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per algorithm, with columns 'mean' and 'sd' (normal
        parameters) and 'alpha' and 'beta' (beta parameters).
    alpha : float
        Significance level used to count false negatives.
    power_threshold : float
        Desired power level.  NOTE(review): accepted but not used anywhere in
        this function — presumably consumed by the plotting helper; confirm.
    testrepeat : int
        Number of test repetitions per sample size (resolution of the power
        estimate).
    simulationrepeat : int
        Number of independent simulations (gives a min/mean/max spread).
    maxdatasetsize : int
        Largest sample size simulated.

    Returns
    -------
    totalPower : ndarray, shape (simulationrepeat, maxdatasetsize+1, 4, n_pairs)
        Power per simulation, sample size, test/distribution combination and
        algorithm pair; axis 2 is ordered [normal+Wilcoxon, beta+Wilcoxon,
        normal+t-test, beta+t-test].
    effectSize : ndarray, shape (simulationrepeat, testrepeat, n_pairs)
        Matched-pairs rank-biserial effect size (Kerby 2014) at the maximum
        sample size.
    dataparameters : list of [j, k]
        Algorithm index pairs identifying each comparison.
    """
    #### FUNCTION STARTS
    # one-to-one comparisons for the Wilcoxon signed-rank test: only pairs of
    # *different* algorithms, since power simulation requires unequal means
    dataparameters = [] #algorithm index pairs [j, k] to be compared
    for j in range(len(df)):
        for k in range(len(df)):
            if k>j: #each unordered pair exactly once; never an algorithm vs itself
                algorithm_power = [j,k]
                dataparameters.append(algorithm_power)
    # sample sizes from 2 (minimum needed for a statistical test) up to
    # maxdatasetsize inclusive
    samples = list(range(2, maxdatasetsize+1))
    # result arrays: power per (simulation, sample size, test combo, pair) and
    # effect size per (simulation, test repeat, pair)
    totalPower = np.zeros((simulationrepeat, maxdatasetsize+1, 4, len(dataparameters)))
    effectSize = np.zeros((simulationrepeat, testrepeat, len(dataparameters)))
    # loop over all one-to-one algorithm comparisons
    for algorithms in dataparameters:
        # repeat the whole experiment to estimate the spread of the results
        # (mean, minimum and maximum of both power and effect size)
        for simulation in range(simulationrepeat):
            # per-simulation scratch array: test statistics, p-values and
            # false-negative counts for the 4 test/distribution combinations
            output = np.zeros((testrepeat, maxdatasetsize+1, 12))
            pownw = [] #power trace, normal distribution + Wilcoxon (local only, not returned)
            powbw = [] #power trace, beta distribution + Wilcoxon (local only, not returned)
            pownt = [] #power trace, normal distribution + paired t-test (local only, not returned)
            powbt = [] #power trace, beta distribution + paired t-test (local only, not returned)
            # power depends on sample size, so evaluate every size separately
            for datasetsize in samples:
                # repeat the tests to get a stable power estimate
                for i in range(testrepeat):
                    # one random sample per algorithm from each distribution
                    # family, parameterised by the empirical data in df
                    norms = [np.random.normal(df['mean'][j], df['sd'][j], datasetsize) for j in algorithms]
                    betas = [np.random.beta(df['alpha'][j], df['beta'][j], datasetsize) for j in algorithms]
                    # Wilcoxon signed-rank tests (statistic T, p-value)
                    Tnw, pvnw = sp.stats.wilcoxon(*norms)
                    Tbw, pvbw = sp.stats.wilcoxon(*betas)
                    # paired t-tests (statistic T, p-value)
                    Tnt, pvnt = sp.stats.ttest_rel(*norms)
                    Tbt, pvbt = sp.stats.ttest_rel(*betas)
                    output[i,datasetsize,0] = Tnw   #statistic, normal + Wilcoxon
                    output[i,datasetsize,1] = pvnw  #p-value, normal + Wilcoxon
                    output[i,datasetsize,2] = Tbw   #statistic, beta + Wilcoxon
                    output[i,datasetsize,3] = pvbw  #p-value, beta + Wilcoxon
                    output[i,datasetsize,4] = Tnt   #statistic, normal + t-test
                    output[i,datasetsize,5] = pvnt  #p-value, normal + t-test
                    output[i,datasetsize,6] = Tbt   #statistic, beta + t-test
                    output[i,datasetsize,7] = pvbt  #p-value, beta + t-test
                    # a p-value above alpha is a type II error (false negative),
                    # because the compared algorithms truly differ; counted for
                    # all four distribution/test combinations
                    if pvnw > alpha:
                        output[i,datasetsize,8]+=1
                    if pvbw > alpha:
                        output[i,datasetsize,9]+=1
                    if pvnt > alpha:
                        output[i,datasetsize,10]+=1
                    if pvbt > alpha:
                        output[i,datasetsize,11]+=1
                # power = 1 - false-negative rate, NORMAL + WILCOXON
                powernw = 1 - sum(output[:,datasetsize,8,])/testrepeat
                pownw.append(powernw)
                totalPower[simulation, datasetsize, 0, dataparameters.index(algorithms)] = powernw
                # power = 1 - false-negative rate, BETA + WILCOXON
                powerbw = 1 - sum(output[:,datasetsize,9])/testrepeat
                powbw.append(powerbw)
                totalPower[simulation, datasetsize, 1, dataparameters.index(algorithms)] = powerbw
                # power = 1 - false-negative rate, NORMAL + T-TEST
                powernt = 1 - sum(output[:,datasetsize,10])/testrepeat
                pownt.append(powernt)
                totalPower[simulation, datasetsize, 2, dataparameters.index(algorithms)] = powernt
                # power = 1 - false-negative rate, BETA + T-TEST
                powerbt = 1 - sum(output[:,datasetsize,11])/testrepeat
                powbt.append(powerbt)
                totalPower[simulation, datasetsize, 3, dataparameters.index(algorithms)] = powerbt
            # effect size at the largest sample size; after the loop above,
            # datasetsize still holds samples[-1] == maxdatasetsize
            S = sum(range(datasetsize+1)) #total rank sum 1+2+...+datasetsize
            # NOTE(review): column 2 is the beta-distribution Wilcoxon
            # statistic — presumably the intended favorable rank sum; confirm
            T = output[:,datasetsize,2]
            effectSize[simulation, :, dataparameters.index(algorithms)] = ((S-T)/S) - (T/S) #rank-biserial effect size following Kerby 2014
    #### FUNCTION ENDS, return and save totalPower, effectSize, dataparameters
    return totalPower, effectSize, dataparameters
#### Experiment for simulating the type I error while comparing two algorithms ####
## Wilcoxon signed-rank test and Paired student t-test ##
## beta distribution and normal distribution ##
def wilcoxonTypeI(df, alpha = 0.05, testrepeat = 100, simulationrepeat = 15, maxdatasetsize = 150):
    """Simulate the type I error rate when comparing an algorithm with itself.

    Every algorithm (row of ``df``) is paired with itself, so the null
    hypothesis of "no difference" is true by construction and every
    significant test result is a false positive.  For each pair, two
    independent samples of size ``maxdatasetsize`` are repeatedly drawn from a
    normal distribution (columns 'mean'/'sd') and a beta distribution
    (columns 'alpha'/'beta') and compared with the Wilcoxon signed-rank test
    and the paired t-test.  The running fraction of p-values <= ``alpha`` is
    recorded after every repeat.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per algorithm, with columns 'mean', 'sd', 'alpha' and 'beta'.
    alpha : float
        Significance level used to count false positives.
    testrepeat, simulationrepeat : int
        Multiplied to give the total number of repeats; the error is averaged
        over all repeats rather than split per simulation as in the power
        experiment.
    maxdatasetsize : int
        Sample size drawn for every test.

    Returns
    -------
    mu, mini, maxi : ndarray, shape (testrepeat*simulationrepeat, 4)
        Mean / minimum / maximum running type I error across algorithms;
        axis 1 is ordered [normal+Wilcoxon, beta+Wilcoxon, normal+t-test,
        beta+t-test].
    """
    # every algorithm is compared with itself: identical distributions make
    # the null hypothesis true, so every rejection is a type I error
    dataparameters = [[j, j] for j in range(len(df))]
    # total number of repeats for the running-average error estimate
    typeIerrorRepeat = testrepeat * simulationrepeat
    # results per (algorithm, repeat, test/distribution combination)
    totaltypeI = np.zeros((len(dataparameters), typeIerrorRepeat, 4))
    # simulate the type I error per algorithm set
    for algorithmindex, algorithms in enumerate(dataparameters):
        # per-repeat record of test statistics, p-values and rejection flags
        output = np.zeros((typeIerrorRepeat, 16))
        tInw = []  # running type I error, normal distribution + Wilcoxon
        tIbw = []  # running type I error, beta distribution + Wilcoxon
        tInt = []  # running type I error, normal distribution + t-test
        tIbt = []  # running type I error, beta distribution + t-test
        for i in range(typeIerrorRepeat):
            # two independent samples from the same empirical distribution
            norms = [np.random.normal(df['mean'][j], df['sd'][j], maxdatasetsize) for j in algorithms]
            betas = [np.random.beta(df['alpha'][j], df['beta'][j], maxdatasetsize) for j in algorithms]
            # Wilcoxon signed-rank tests (statistic T, p-value)
            Tnw, pvnw = sp.stats.wilcoxon(*norms)
            Tbw, pvbw = sp.stats.wilcoxon(*betas)
            # paired t-tests (statistic T, p-value)
            Tnt, pvnt = sp.stats.ttest_rel(*norms)
            Tbt, pvbt = sp.stats.ttest_rel(*betas)
            # store test statistics and p-values for this repeat
            output[i, 0:8] = [Tnw, pvnw, Tbw, pvbw, Tnt, pvnt, Tbt, pvbt]
            # flag false positives: a significant result under a true null
            if pvnw <= alpha:
                output[i, 8] += 1
            if pvbw <= alpha:
                output[i, 9] += 1
            if pvnt <= alpha:
                output[i, 10] += 1
            if pvbt <= alpha:
                output[i, 11] += 1
            # running false-positive ratio over the i+1 completed repeats.
            # BUGFIX: the original divided by i, which produced inf/NaN on the
            # first repeat (division by zero) and slightly overestimated the
            # ratio on every later repeat (off-by-one).
            tInw.append(sum(output[:, 8]) / (i + 1))
            tIbw.append(sum(output[:, 9]) / (i + 1))
            tInt.append(sum(output[:, 10]) / (i + 1))
            tIbt.append(sum(output[:, 11]) / (i + 1))
        # store the running error traces for this algorithm
        totaltypeI[algorithmindex, :, 0] = tInw
        totaltypeI[algorithmindex, :, 1] = tIbw
        totaltypeI[algorithmindex, :, 2] = tInt
        totaltypeI[algorithmindex, :, 3] = tIbt
    # spread over algorithms: mean, minimum and maximum type I error
    mu = totaltypeI.mean(axis=0)
    mini = totaltypeI.min(axis=0)
    maxi = totaltypeI.max(axis=0)
    return mu, mini, maxi
#### Define plots for Power analysis graphing ####
def powerWilcoxonPlot(minarray, maxarray, meanarray, power_threshold = 0.8):
    """Plot mean power curves with min/max error bands.

    Draws, for each of the four test/distribution combinations produced by the
    power simulation (columns 0-3: normal+Wilcoxon, beta+Wilcoxon,
    normal+t-test, beta+t-test), a shaded band between the minimum and maximum
    power and a line for the mean power, plus a horizontal line at
    ``power_threshold``.

    Parameters
    ----------
    minarray, maxarray, meanarray : 2-D arrays indexed (sample size, combo).
    power_threshold : float, height of the reference line.
    """
    n_samples = len(meanarray)
    xs = range(n_samples)
    # (column, legend label, colour) for each test/distribution combination
    series = [
        (0, 'Normal distribution, Wilcoxon signed-rank test', '#D8789B'),
        (1, 'Beta distribution, Wilcoxon signed-rank test', '#D81B60'),
        (2, 'Normal distribution, Paired t-test', '#80ADD4'),
        (3, 'Beta distribution, Paired t-test', '#1E88E5'),
    ]
    # shaded min-max error band per combination
    for col, _, colour in series:
        plt.fill_between(xs, minarray[:, col].tolist(), maxarray[:, col].tolist(),
                         color=colour, hatch='...', alpha=0.1)
    # mean power line per combination
    for col, label, colour in series:
        plt.plot(xs, meanarray[:, col].tolist(), label=label, color=colour)
    # horizontal reference line at the desired power
    plt.hlines(power_threshold, 0, n_samples)
    # axis limits and labels
    plt.xlim(0.0, n_samples)
    plt.ylim(0.0, 1.0)
    plt.xlabel('Sample size')
    plt.ylabel('Power')
    plt.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), ncol=1)
#### Define plots for type I error graphing ####
def errorWilcoxonPlot(mu, mini, maxi, alpha = 0.05):
    """Plot running type I error curves with min/max envelopes.

    Draws the mean type I error trace for the four test/distribution
    combinations (columns 0-3: normal+Wilcoxon, beta+Wilcoxon, normal+t-test,
    beta+t-test), thin minimum/maximum envelope lines, and a horizontal
    reference line at ``alpha``.

    Parameters
    ----------
    mu, mini, maxi : 2-D arrays indexed (repeat, combo), as returned by
        wilcoxonTypeI.
    alpha : float, height of the significance reference line.
    """
    ylim = 0.51
    # x axis: one point per simulation repeat
    samples = range(len(mu))
    # extract plot data from the arrays
    totaltypeInw = mu[:, 0].tolist()
    mintypeInw = mini[:, 0].tolist()
    maxtypeInw = maxi[:, 0].tolist()
    totaltypeIbw = mu[:, 1].tolist()
    mintypeIbw = mini[:, 1].tolist()
    maxtypeIbw = maxi[:, 1].tolist()
    totaltypeInt = mu[:, 2].tolist()
    mintypeInt = mini[:, 2].tolist()
    maxtypeInt = maxi[:, 2].tolist()
    totaltypeIbt = mu[:, 3].tolist()
    mintypeIbt = mini[:, 3].tolist()
    maxtypeIbt = maxi[:, 3].tolist()
    # averaged type I error lines
    plt.plot(samples, totaltypeInt, label='Normal distribution, Paired t-test', color='#80ADD4')
    plt.plot(samples, totaltypeIbt, label='Beta distribution, Paired t-test', color='#1E88E5')
    plt.plot(samples, totaltypeInw, label='Normal distribution, Wilcoxon signed-rank test', color='#D8789B')
    plt.plot(samples, totaltypeIbw, label='Beta distribution, Wilcoxon signed-rank test', color='#D81B60')
    # min/max envelopes, one explicit plot call per line.
    # BUGFIX: the original passed min and max arrays to a single plt.plot call
    # (plt.plot(x, y1, y2)); matplotlib treats the third array as a new line
    # with an implicit x of range(len), which only coincidentally matched
    # `samples`. Explicit calls make the x axis intentional.
    plt.plot(samples, mintypeInt, color='#80ADD4', linewidth=0.5)
    plt.plot(samples, maxtypeInt, color='#80ADD4', linewidth=0.5)
    plt.plot(samples, mintypeIbt, color='#1E88E5', linewidth=0.5)
    plt.plot(samples, maxtypeIbt, color='#1E88E5', linewidth=0.5)
    plt.plot(samples, maxtypeIbw, label='Minimal and maximal type I error', color='#D81B60', linewidth=0.5)
    plt.plot(samples, mintypeIbw, color='#D81B60', linewidth=0.5)
    plt.plot(samples, mintypeInw, color='#D8789B', linewidth=0.5)
    plt.plot(samples, maxtypeInw, color='#D8789B', linewidth=0.5)
    # plot characteristics: limits, significance line, labels, legend
    plt.xlim(0.0, len(samples))
    plt.ylim(-0.01, ylim)
    plt.hlines(alpha, 0, len(samples))
    plt.xlabel('Total simulations')
    plt.ylabel('Type I error')
    plt.legend()
    plt.show()