-
Notifications
You must be signed in to change notification settings - Fork 10
/
Correlation.R
144 lines (100 loc) · 6.9 KB
/
Correlation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"
NOTE: First Column is treated as 1 in the Selection of Data:
1 - Please make sure your csv file contains only numeric variables with headers for the code to run.
Column(Instance) 1 Column(Instance) 2 . . . . Column(Instance) n
Row(Variable) 1 (Value) (Value) . . . . (Value)
Row(Variable) 2 (Value) (Value) . . . . (Value)
. . . .
. . . .
. . . .
. . . .
Row(Variable) n (Value) (Value) . . . . (Value)
2 - To run the code, select the whole code and run as source (top right in this window) & enter parameter values in the console below
In this case select:
a- the dataset to work with
b- type of separator in your csv file
c- choose correlation to be calculated between rows(variables) or columns(instances)
d- range of columns for numeric data
e- significance value for correlation - the minimum value for correlation you would like to assign (e.g. 0.05)
f- For row wise calculation of correlation - whether the first column of your dataset contains the row names
3 - After the normalized values are calculated you can view the resulting matrices from the environment window (right) &
they will be exported to your present working directory (location of this RScript) as csv files
Methods are kendall, pearson or spearman
1: Pearson Correlation (Calculates correlation, but requires linear correlation)
2: Spearman Correlation (Correlation using ranks, do not require a linear correlation)
3: Kendalls Tau Correlation (Correlation using ranks, do not require a linear correlation)--> Results of Kendallas Tau and Spearman are
quite similar. But Kendalls Tau: usually smaller values than Spearmans rho correlation. Calculations based on concordant and discordant
pairs. Insensitive to error. P values are more accurate with smaller sample sizes.
For column wise
* Cor_c_Test_k contains the kendall correlation values (exported as Correlation_Col_Kendall.csv)
* Cor_c_Test_p contains the pearson correlation values (exported as Correlation_Col_Pearson.csv)
* Cor_c_Test_s contains the spearman correlation values (exported as Correlation_Col_Spearman.csv)
For row wise
* Cor_r_Test_k contains the kendall correlation values (exported as Correlation_Row_Kendall.csv)
* Cor_r_Test_p contains the pearson correlation values (exported as Correlation_Row_Pearson.csv)
* Cor_r_Test_s contains the spearman correlation values (exported as Correlation_Row_Spearman.csv)
"
cat("\f") # Clear old outputs
rm(list=ls()) # Clear all variables
#------------------------------------------------
"SELECTION OF DATSET AND PARAMETERS"
#------------------------------------------------
rm(list=ls())
#User input for data
print(paste("Please select Input CSV", " The different samples in columns and the measured variables in the rows."), quote = FALSE)
fname <- file.choose()
#ask user for type of Separator:
ask_sep <- as.character(readline(prompt = "ENTER either of the types of Separator ',' or ';' : "))
#ask user for correlation to be calculated between columns or rows :
ask_type <- as.character(readline(prompt = "ENTER 'r' for rows or 'c' for columns : "))
#Input file read
file1 <- read.csv(fname, sep=ask_sep)
cat("\f") # Clear old outputs
#Extract continuous variables:
start_num <- as.integer(readline(prompt = "Enter value for START of range of numerical variable: "))
cat("\f") # Clear old outputs
end_num <- as.integer(readline(prompt = "Enter value for END of range of numerical variable: "))
#ask user for significant value of correlation :
sig_p <- as.numeric(readline(prompt = "significance value for correlation - the minimum value for correlation you would like to assign (e.g. 0.05) : "))
#Sub space the numeric data frame:
data_csv <- file1[,start_num : end_num] #all cont. variables
cat("\f") # Clear old outputs
#--------------------------------------------------
"Calculating correlation"
#--------------------------------------------------
# use = "complete.obs" when you have missing values
# use = "all.obs" when you DO NOT have missing values
if (ask_type == 'c'){
Cor_c_Test_k <- cor(data_csv, method = "kendall", use = "complete.obs")
Cor_c_Test_p <- cor(data_csv, method = "pearson", use = "complete.obs")
Cor_c_Test_s <- cor(data_csv, method = "spearman", use = "complete.obs")
Cor_c_Test_k <- as.data.frame(Cor_c_Test_k)
Cor_c_Test_p <- as.data.frame(Cor_c_Test_p)
Cor_c_Test_s <- as.data.frame(Cor_c_Test_s)
Cor_c_Test_k <- replace(Cor_c_Test_k, Cor_c_Test_k < sig_p, "No_correlation")
Cor_c_Test_p <- replace(Cor_c_Test_p, Cor_c_Test_p < sig_p, "No_correlation")
Cor_c_Test_s <- replace(Cor_c_Test_s, Cor_c_Test_s < sig_p, "No_correlation")
write.csv(as.matrix(Cor_c_Test_k), file = "Correlation_Col_Kendall.csv", row.names = TRUE)
write.csv(as.matrix(Cor_c_Test_p), file = "Correlation_Col_Pearson.csv", row.names = TRUE)
write.csv(as.matrix(Cor_c_Test_s), file = "Correlation_Col_Spearman.csv", row.names = TRUE)
} else if (ask_type == 'r') {
ask_row_names <- as.character(readline(prompt = "Does the first variable column in your dataset contain rownames (Y or N)? : "))
if (ask_row_names == 'Y') {
rownames(data_csv) <- file1[,1]
}
data_csv2 <- t(data_csv)
Cor_r_Test_k <- cor(data_csv2, method = "kendall", use = "complete.obs")
Cor_r_Test_p <- cor(data_csv2, method = "pearson", use = "complete.obs")
Cor_r_Test_s <- cor(data_csv2, method = "spearman", use = "complete.obs")
Cor_r_Test_k <- as.data.frame(Cor_r_Test_k)
Cor_r_Test_p <- as.data.frame(Cor_r_Test_p)
Cor_r_Test_s <- as.data.frame(Cor_r_Test_s)
Cor_r_Test_k <- replace(Cor_r_Test_k, Cor_r_Test_k < sig_p, "No_correlation")
Cor_r_Test_p <- replace(Cor_r_Test_p, Cor_r_Test_p < sig_p, "No_correlation")
Cor_r_Test_s <- replace(Cor_r_Test_s, Cor_r_Test_s < sig_p, "No_correlation")
write.csv(as.matrix(Cor_r_Test_k), file = "Correlation_Row_Kendall.csv", row.names = TRUE)
write.csv(as.matrix(Cor_r_Test_p), file = "Correlation_Row_Pearson.csv", row.names = TRUE)
write.csv(as.matrix(Cor_r_Test_s), file = "Correlation_Row_Spearman.csv", row.names = TRUE)
}
cat("\f") # Clear old outputs
print(paste("FINISHED"), quote = FALSE)