forked from tcratius/A3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CodeForA3.R
183 lines (149 loc) · 6.55 KB
/
CodeForA3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
library(ggplot2)
library(tidyverse)
library(RColorBrewer)
# Constants
less.than.value <- 4499 # Hours used x-intercept
mu <- 4800 # mean in Hour
sigma <- 400 # sigma
rnorm.axis <- mu * 2 # Number of values to be randomly generated
# The lifetime of a particular type of TV follows a normal distribution with mean =
# 4800 hours and equal std =400 hours.
# (a) Find the probability that a single randomly-chosen TV will last
# less than 4,500 hours. Use R to assist with your computations.
# (b) Find the probability that the mean lifetime of a random sample
# of 16 TVs is less than 4,500 hours. Use R to assist with your computations.
# (c) Compare answers from (a) and (b).
# probability of vectors
# (a) Find the probability that a single randomly-chosen TV will last
# less than 4,500 hours. Use R to assist with your computations.
# So the probability (success) of a TV lasting less than 4,500 hours is;
# https://www.theanalysisfactor.com/understanding-odds-and-probability/
p.aTV <- pnorm(less.than.value, mu, sigma) # Success
p.aTV
# Create random vectors of a normal distribution using mean, standard deviation
# and x end and assign to variable named TV_Life_Time
# and then convert to a data.frame for use with ggplot
TV <- data.frame(x = rnorm(rnorm.axis, mu, sigma), y=rnorm(rnorm.axis, mu, sigma))
# Add to ggplot clustering
nClusters <- 7 # Cluster it
kMeans <- kmeans(TV, centers = nClusters)
TV$Cluster <- as.factor(kMeans$cluster)
plot <- ggplot(TV, aes(x=x, y=y)) +
stat_density2d(aes(fill = Cluster, colour = Cluster,
alpha = ..level..),
geom = "polygon")
# Creat a plot that shows where less than 4,500 sits on a
# normal distribution
plotTV <- ggplot(TV, aes(x=x)) +
geom_density(alpha= 0.2, fill='red', size=1) +
labs(title=('TV Life Time - Normal Distribution')) +
#scale_x_continuous(
# breaks = c(4, 4.5, mean.scaled, 5, 5.5, 6),
#labels = c('4000', '4500', 'Mean', '5000','5500', '6000'),
#name = "TV Life Time (Hours)") +
scale_y_continuous(name = 'Density') + #, labels = scales::percent_format()) +
theme_minimal()
plotTV
# vertical line spliting the area of the normal distribution
# at which the probability become 1 - p
plotTV + geom_vline(xintercept = less.than.value.scaled)
# geom_text(aes(y=1,label=aTV)) +
###
###
### https://stackoverflow.com/questions/3494593/shading-a-kernel-density-plot-between-two-points
###
# function that approximates the density at the provided values
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
probsl <- c(0.00, p.aTV) # lower
#probsu <- c(0.885, 1.00) # upper
#probch <- c(aTV, p.bTV) # change from lower
TV2 <- TV %>%
mutate(dy = approxdens(x), # calculate density
p = percent_rank(x), # percentile rank
pcatl = as.factor(cut(p, breaks = probsl, # percentile category
include.lowest = TRUE))) #based on probs
#pcatch = as.factor(cut(p, breaks = probch, # percentile category
# include.lowest = TRUE))) # based on probs
# lower tail geom_ribbon
ggplot(TV2 , aes(x, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcatl)) +
geom_line() +
labs(title=('TV Life Time - Normal Distribution')) +
scale_x_continuous(
breaks = c(4000, less.than.value, mu, 5000, 5500, 6000),
labels = c('4000', less.than.value, 'Mean', '5000','5500', '6000'),
name = "TV Life Time (Hours)") +
scale_y_continuous(name = 'Density') +
scale_fill_brewer(palette = 4, guide = 'none') +
theme_bw()
###
###
###
# Central limit theorum
# n = 16
# Sampling distribution of the sample mean, we have the population mean 4800
# Sample Error of the mean
# trial histogram first
# (b) Find the probability that the mean lifetime of a random sample
# of 16 TVs is less than 4,500 hours. Use R to assist with your computations.
# Take a random sample of 16 TV's from a sample distribution of the sample mean
# equal to the population mean then find the mean of this subset.
# find standard error of the bTV
# if multiple samples were take then
# the standard error of bTV could be done
bTV <- sample(TV$x, 16, replace=FALSE)
# Find the probability of the sample distribution of the sample mean for n= 16
# using the 16 sampled TV's and samples standard deviation and mean
sd.bTV <- sd(bTV)
mean.bTV <- mean(bTV)
# mean 4800
# sd 400
se <- std/sqrt(1)
z <- (4500-4800)/se
pnorm(z)
SE <- std/sqrt(16)
Z <- (4500- 4800)/SE
pnorm(Z)
p.bTV <- pnorm(less.than.value, meanbTV, sd.bTV, lower.tail = TRUE)
pbinom(1, 16, p.bTV)
probsl <- c(0.00, p.aTV) # lower
#probsu <- c(0.885, 1.00) # upper
probch <- c(p.aTV, p.bTV)
TV2 <- TV %>%
mutate(dy = approxdens(x), # calculate density
p = percent_rank(x), # percentile rank
pcatl = as.factor(cut(p, breaks = probsl, # percentile category lower
include.lowest = TRUE)), #based on probs
pcatch = as.factor(cut(p, breaks = probch, # percentile category change
include.lowest = TRUE))) # based on probs
# pcatl lower tail geom_ribbon
# pcatch deviation from probability of population based on sample 16
ggplot(TV2 , aes(x, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcatl)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcatch)) +
geom_line() +
labs(title=('TV Life Time - Normal Distribution')) +
scale_x_continuous(
breaks = c(4000, less.than.value, mu, 5000, 5500, 6000),
labels = c('4000', less.than.value, 'Mean', '5000','5500', '6000'),
name = "TV Life Time (Hours)") +
scale_y_continuous(name = 'Density') +
scale_fill_brewer(palette = 4, guide = 'none') +
theme_bw()
# Beta endorphins are morphine like substances produced by the body.
# They create a sense of well-being. It has been proposed that Beta
# endorphins increase with exercise. Test this hypothesis using the
# data in beta.csv which has Beta endorphin levels for 10 people
# measured for each person pre- and post exercise. Using this sample,
# test if Beta endorphins increase with exercise. Adopt a 5% risk of
# committing a type I error.
# 1. Enter the data into R.
# 2. Perform some exploratory data analysis procedures.
# 3. Perform an appropriate significance test.
# 4. State any assumptions needed to support the validity of the procedure and where possible comment on the adequacy of these assumptions.
# 5. If you were conducting this experiment, what would you try to do to minimise confounding?
# Hint: use R to assist your calculations.