-
Notifications
You must be signed in to change notification settings - Fork 0
/
Feature_matching.R
128 lines (88 loc) · 4.76 KB
/
Feature_matching.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Match features from metabolomics datasets
# Get common features between CS and HCC datasets and use this as a starting point for analysis
CS_HCC_match <- function(RTtol = 0.1, study = c("cs", "hcc"), mode = c("pos", "neg"), filt = NULL) {
library(haven)
library(tidyverse)
library(fuzzyjoin)
# Get CS data and HCC data (feature names only)
# Pos or neg mode for CS and HCC
if(mode == "pos") {
cs <- read_tsv("data/EPIC Cross sectional RP POS Feature table.txt", skip=4) %>% select(1) %>% mutate(CS_feat = 1:n())
hcc <- read_csv("data/EPIC liver cancer 2016 RP POS feature table.csv") %>%
select(1) %>% slice(-(83:84)) %>% mutate(HCC_feat = 1:n())
} else if (mode == "neg") {
cs <- read_tsv("data/EPIC Cross sectional RP NEG Feature table.txt", skip=4) %>% select(1) %>% mutate(CS_feat = 1:n())
hcc <- read_csv("data/EPIC liver cancer 2016 RP Neg Feature Table.csv") %>%
select(1) %>% mutate(HCC_feat = 1:n())
}
# Filter by the ordered features
if(!is.null(filt)) cs <- cs[sort(filt), ]
# Join features
csfeat <- cs %>% separate(Compound, into = c("Mass", "RT"), sep = "@", convert = T)
hccfeat <- hcc %>% separate(Compound, into = c("Mass", "rt"), sep = "@", convert = T)
output <- difference_inner_join(csfeat, hccfeat, max_dist = 0.005, distance_col = "massdiff") %>%
filter(abs(RT - rt) < RTtol) #%>% arrange(CS.feat)
return(output)
# 2922 features matched in pos and 1243 in neg mode
# Get unique vector of CSS features that matched. These are then used in Intake_correlation to filter starting FTs
if(study == "cs") v1 <- unique(joindf$CS_feat) else if (study == "hcc") v1 <- unique(joindf$HCC_feat)
}
# Outputs matched df for pos and neg
matched_pos <- CS_HCC_match(mode = "pos")
matched_neg <- CS_HCC_match(mode = "neg")
# Extract vectors of matched features
CSpos <- unique(matched_pos$CS.feat)
CSneg <- unique(matched_neg$CS.feat)
HCCpos <- unique(matched_pos$HCC.feat)
HCCneg <- unique(matched_neg$HCC.feat)
# Filtered by those matched features that are significant in CS only
filt1 <- alcpos_cs$feat
filt2 <- alcneg_cs$feat
matched_pos_filt <- CS_HCC_match(mode = "pos", filt = filt1)
matched_neg_filt <- CS_HCC_match(mode = "neg", filt = filt2)
# ----
# Use vectors to subset peak tables and run partial correlation function
# Now functions from Intake_correlation scripts, CS and HCC. Output is assigned alcpos.cs etc
# ----
# Pos mode: subset Pcor, pval and feature number and join datasets
alcpos.cs1 <- alcpos.cs %>% select(Pcor : CS.feat)
alcpos.hcc1 <- alcpos.hcc %>% select(Pcor : HCC.feat)
pos.match1 <- matched_pos %>% left_join(alcpos.cs1, by = "CS.feat") %>%
left_join(alcpos.hcc1, by = "HCC.feat")
# Plot correlations
plot(pos.match1$Pcor.x, pos.match1$Pcor.y, xlab = "Correlation CS", ylab = "Correlation HCC")
# Neg mode
alcneg.cs1 <- alcneg.cs %>% select(Pcor : CS.feat)
alcneg.hcc1 <- alcneg.hcc %>% select(Pcor : HCC.feat)
neg.match1 <- matched_neg %>% left_join(alcneg.cs1, by = "CS.feat") %>%
left_join(alcneg.hcc1, by = "HCC.feat")
# Plot correlations
plot(neg.match1$Pcor.x, neg.match1$Pcor.y, xlab = "Correlation CS", ylab = "Correlation HCC")
# Plot all
library(ggplot2)
library(ggrepel)
all.match <- bind_rows(pos.match1, neg.match1, .id = "mode") %>%
mutate(mz = ifelse(Pcor.x > 0.2 & Pcor.y > 0.2, paste(Mass.x, round(RT, 2), sep = "@"), NA),
pointcol= ifelse(Pcor.x > 0.2 & Pcor.y > 0.2, "A", "B"))
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
all.match %>%
ggplot(aes(x = Pcor.x, y = Pcor.y, shape = mode, colour = pointcol)) + geom_point() + theme_bw() +
scale_color_manual(values = c("black", "darkgrey")) +
xlab("Partial correlation cross-sectional") + ylab("Partial correlation HCC") +
geom_hline(yintercept = 0, linetype = "dashed") + geom_vline(xintercept = 0, linetype = "dashed") +
geom_text_repel(aes(label = mz), size = 3, direction = "both", nudge_x = 0, point.padding = 0.4,
segment.size = 0.25, force = 0.2) +
theme(legend.position = "none") +
scale_x_continuous(expand = c(0,0), limits = c(0, 0.5)) +
scale_y_continuous(expand = c(0,0), limits = c(0, 0.5))
# ----
# loop to get tolerance vs feature numbers
tol <- seq(0.05, 0.15, by = 0.005)
output <- numeric(length(tol))
for(i in 1:length(tol)) {
output[i] <-
difference_inner_join(csfeat, hccfeat, max_dist = 0.005, distance_col = "massdiff") %>%
filter(abs(RT - rt) < tol[i]) %>% nrow()
}
plot(tol, output, type = "o", xlab = "Retention time tolerance", ylab = "No. features matched")