-
Notifications
You must be signed in to change notification settings - Fork 0
/
3_getHomolog_Prot_Seq.R
133 lines (96 loc) · 4.65 KB
/
3_getHomolog_Prot_Seq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
library(tidyverse)
library(rentrez)
library(biomaRt)
library(EnsDb.Hsapiens.v86)
library(ensembldb)
library(Biostrings)
ensembl = useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
#load ensembl annotations
setwd("/ix/djishnu/Aaron/data")
data = readRDS("/ix/djishnu/Aaron/data/output_data/mapped_snp_to_protein.RDS") %>% drop_na()
names(data)[which(names(data) == "start")] = "tx_start"
names(data)[which(names(data) == "end")] = "tx_end"
# joining cds with tx coordinates for mapping
cds_pos = readRDS("/ix/djishnu/Aaron/data/output_data/cds_pos_from_transcript.RDS") %>% as.data.frame() %>% distinct()
names(cds_pos)[1:2] = c("cds_start", "cds_end")
data = (data %>% dplyr::select(-width)) %>% left_join(cds_pos, by = c("ensembl_transcript_id" = "names",
"tx_start" = "tx_start",
"tx_end" = "tx_end"))
# getting peptide seqs
unique_proteins = unique(data$ensembl_protein_id)
# map these for seqs
protein_seqs = getBM(attributes = c("peptide", "ensembl_peptide_id"), filters = "ensembl_peptide_id",
values = unique_proteins, mart = ensembl)
data = data %>% left_join(protein_seqs, by = c("ensembl_protein_id" = "ensembl_peptide_id"))
# remove asterisk as end of protein sequences
data = separate(data, col = "peptide", sep = "\\*", into = c("peptide", "asterisk")) %>% dplyr::select(-asterisk)
# function to put SNP into transcript
substitute_base = function(rw){
tx = Biostrings::DNAString(rw$coding)
# make edit
if(tx[rw$tx_start] == rw$ref){
tx[rw$tx_start] = rw$var
rw$mut_tx = toString(tx)
}
return(rw)
}
# the index, mut_txs, mut_prots all have the same order, so I just used
# this index df to hold the additional data (gene id,..) because the biostring
# objects are weird
index = data %>% dplyr::select(coding, cds_start, residue_start, ref, var,
ensembl_protein_id, peptide, gene_id, ensembl_transcript_id,
chr, coord)
# saveRDS(index, "/ix/djishnu/Aaron/data/output_data/mutant_sequence_index.RDS")
mut_txs = sapply(index$coding, Biostrings::DNAString)
mut_prots = sapply(index$peptide, Biostrings::AAString)
# make change in each rna transcript and translate to protein
for(i in 1:length(mut_txs)){
sq = mut_txs[[i]]
pos = index[i,]$cds_start
letter = DNAString(index[i,]$var)
if(length(letter) > 1){
inds = seq(pos, pos+length(letter) - 1, by = 1)
replaceLetterAt(sq, at = inds, letter = letter)
} else {
replaceLetterAt(sq, at = pos, letter = letter)
}
mut_txs[[i]] = sq
mut_prots[[i]] = translate(sq, if.fuzzy.codon = "solve")
}
names(mut_prots) = index$ensembl_protein_id
# saveRDS(index, "/ix/djishnu/Aaron/data/index_file.RDS")
# saveRDS(mut_txs, "/ix/djishnu/Aaron/data/mutant_transcripts.RDS")
# saveRDS(mut_prots, "/ix/djishnu/Aaron/data/output/mutant_proteins.RDS")
# index = readRDS("/ix/djishnu/Aaron/data/output/index_file.RDS")
protein_names = unique(index$ensembl_protein_id)
mut_prots_AAset = AAStringSet(mut_prots)
# convert biostrings object to df
mut_prots_df = data.frame(mut_prots_AAset@ranges)
mut_prots_df$peptide = unlist(lapply(mut_prots_AAset, Biostrings::toString))
mut_prots_df$tx_id = index$ensembl_transcript_id
mut_prots_df$chr = index$chr
mut_prots_df$coord = index$coord
mut_prots_df$res_no = index$residue_start
# remove asterisk again after translating
mut_prots_df = separate(mut_prots_df, col = "peptide", sep = "\\*", into = c("peptide", "asterisk")) %>% dplyr::select(-asterisk)
# saveRDS(mut_prots_df, "/ix/djishnu/Aaron/data/output/mutant_protein_sequences.RDS")
# iterate through each protein and get AA sequence and save to fasta file in folder
for(i in 1:length(protein_names)) {
# get proteins
same_protein = mut_prots_df %>% dplyr::filter(names == protein_names[i])
# write these to folder
protein_dir = paste0("/ix/djishnu/Aaron/data/fastas_clustal/protein_fastas/", protein_names[i])
if(!file.exists(protein_dir)){
dir.create(protein_dir)
}
protein_string_set = AAStringSet(same_protein$peptide)
names(protein_string_set) = paste0(same_protein$names, "_", same_protein$tx_id, "_",
same_protein$chr, "_", same_protein$coord, "_",
i)
# write mutated seqs with name ENS_protein#_ENS_transcript#
writeXStringSet(protein_string_set, paste0(protein_dir, "/", protein_names[i],".fasta"), format = "fasta")
# write reference seq
ref_pep = AAStringSet(index$peptide[i])
names(ref_pep) = paste0(protein_names[i], "_ref")
writeXStringSet(ref_pep, paste0(protein_dir, "/", protein_names[i],"_ref.fasta"), format = "fasta")
}