-
Notifications
You must be signed in to change notification settings - Fork 15
/
gene_count_processing_sciRNAseq.R
81 lines (76 loc) · 3.75 KB
/
gene_count_processing_sciRNAseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# define the report folder from sci-RNA-seq pipeline
report_folder ="~/Projects/nobackup/180826_drosophila_ciona_test//output_dro/report"
# define the output folder for output the df_cell, df_gene and gene_count matrix
output_folder = "~/Projects/nobackup/180826_drosophila_ciona_test//output_dro/report/"
suppressMessages(library(Matrix))
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
combine_exon_intron <- function (df_gene, gene_count)
{
gene_count_exon = gene_count[df_gene$exon_intron == "exon",
]
gene_count_intron = gene_count[df_gene$exon_intron == "intron",
]
if (nrow(gene_count_exon) == nrow(gene_count_intron)) {
gene_count_combine = gene_count_exon + gene_count_intron
}
else {
gene_count_combine = gene_count_exon[-nrow(gene_count_exon),
] + gene_count_intron
gene_count_combine = rbind(gene_count_combine, gene_count_exon[nrow(gene_count_exon),
])
}
return(gene_count_combine)
}
sciRNAseq_gene_count_summary <- function (gene_count_folder) {
gene_matrix = paste(gene_count_folder, "/count.MM", sep = "")
df_gene = paste(gene_count_folder, "/gene_name_annotate.txt",
sep = "")
df_cell = paste(gene_count_folder, "/cell_annotate.txt",
sep = "")
df_report = paste(gene_count_folder, "/report.MM", sep = "")
report_annotate = paste(gene_count_folder, "/report_annotate.txt",
sep = "")
df_gene = read.csv(df_gene, header = F)
df_cell = read.csv(df_cell, header = F)
gene_matrix = read.csv(gene_matrix, header = F)
colnames(df_gene) = c("gene_id", "gene_type", "exon_intron",
"gene_name", "index")
colnames(df_cell) = c("sample", "index")
rownames(df_gene) = df_gene$gene_id
rownames(df_cell) = df_cell$cell_name
gene_count = sparseMatrix(i = gene_matrix$V1, j = gene_matrix$V2,
x = gene_matrix$V3)
df_gene = df_gene[1:nrow(gene_count), ]
rownames(gene_count) = df_gene$gene_id
colnames(gene_count) = df_cell$cell_name
gene_count = combine_exon_intron(df_gene, gene_count)
df_gene = df_gene %>% filter(exon_intron == "exon")
reportMM = read.csv(df_report, header = F)
df_report = sparseMatrix(i = reportMM$V1, j = reportMM$V2,
x = reportMM$V3)
df_report = as.matrix(t(df_report))
df_report_annotate = read.csv(report_annotate, header = F)
colnames(df_report) = df_report_annotate$V2
df_report = data.frame(df_report)
df_report["index"] = as.numeric(rownames(df_report))
df_cell_combine = inner_join(df_cell, df_report, by = "index")
df_cell_combine["all_exon"] = df_cell_combine$X.Perfect.intersect.exon.match +
df_cell_combine$X.Nearest.intersect.exon.match + df_cell_combine$X.Perfect.combine.exon.match +
df_cell_combine$X.Nearest.combine.exon.match
df_cell_combine["all_intron"] = df_cell_combine$X.Perfect.intersect.gene.match +
df_cell_combine$X.Nearest.intersect.gene.match + df_cell_combine$X.Perfect.combine.gene.match +
df_cell_combine$X.Nearest.combine.gene.match
df_cell_combine["all_reads"] = df_cell_combine$all_exon +
df_cell_combine$all_intron + df_cell_combine$X.No.match
df_cell_combine["unmatched_rate"] = df_cell_combine$X.No.match/df_cell_combine$all_reads
df_cell = df_cell_combine %>% select(sample, unmatched_rate)
df_cell$UMI_count = df_cell_combine$all_exon + df_cell_combine$all_intron
df_gene = df_gene %>% select(gene_id, gene_type, gene_name)
return(list(df_cell, df_gene, gene_count))
}
result = sciRNAseq_gene_count_summary(paste0(report_folder, "/human_mouse_gene_count/"))
df_cell = result[[1]]
df_gene = result[[2]]
gene_count = result[[3]]
save(df_cell, df_gene, gene_count, file = paste0(output_folder, "/sci_summary.RData"))