-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_get_protein_sequence.R
115 lines (81 loc) · 5.07 KB
/
2_get_protein_sequence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
library(tidyverse)
library(rentrez)
library(biomaRt)
library(EnsDb.Hsapiens.v86)
library(ensembldb)
ensembl = useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
#load ensembl annotations
setwd("/ix/djishnu/Aaron/data")
ens_tbl = read_tsv("/ix/djishnu/Aaron/data/gene_annotations_full.tsv")
# get unique ensembl gene IDs
ens_genes = ens_tbl %>% distinct(gene_id)
# get transcript (primary tx)
filt = listFilters(ensembl)[54, 1]
attr = listAttributes(ensembl)[c(18, 32,3, 1, 22), 1]
tx = getBM(attributes = attr, filters = filt, values = ens_genes$gene_id, mart = ensembl)
# all mane select txs
tx_select = tx %>% dplyr::filter(transcript_mane_select != '')
# get transcript sequences
tx_seq = getBM(attributes = c(attr[3],"coding", "ccds"),
filters = listFilters(ensembl)[56,1],
values = tx_select$ensembl_transcript_id, mart = ensembl)
tx_seq = tx_seq %>% left_join(tx_select, by = "ensembl_transcript_id")
# remove exons (we don't need duplicated rows because of different exon annotations)
tx_seqs_by_pos = (ens_tbl %>% dplyr::select(-entrezid, -tx_biotype, -strand, -exon_id)) %>% distinct()
tx_seqs_by_pos = tx_seqs_by_pos %>% left_join(tx_seq, by = c("gene_id" = "ensembl_gene_id"))
# remove na columns because they aren't annotated with high quality transcripts
tx_seqs_by_pos = tx_seqs_by_pos %>% drop_na()
# we can also remove the MANE select version so that we keep dimensions small
# deleted version # e.g MN_xxx.4 -> MN_xxx
tx_seqs_by_pos = separate(tx_seqs_by_pos, col = "transcript_mane_select", into = c("transcript_mane_select", "mane_version"),
sep = "\\.") %>% dplyr::select(-mane_version)
# saveRDS(tx_seqs_by_pos, file = "/ix/djishnu/Aaron/data/output/MANE_select_tx.RDS")
# tx_seq = readRDS("/ix/djishnu/Aaron/data/output/MANE_select_tx.RDS")
tx_seq = tx_seqs_by_pos
#map coordinate to transcript position
#coordinate ranges
coord_range = GRanges(paste0(tx_seq$chr, ":", tx_seq$coord))
# map genome coordinates to transcript coordinates (this takes a while to run)
gnm_tx = genomeToTranscript(coord_range, EnsDb.Hsapiens.v86)
# saveRDS(gnm_tx, "/ix/djishnu/Aaron/data/output/transcript_ranges.RDS")
# gnm_tx = readRDS("/ix/djishnu/Aaron/data/output/transcript_ranges.RDS")
#unlist GRanges object and convert to dataframe
# main ones need to be removed are exon id/rank because there are multiple mappings of
# different exons to same transcript, so a join creates many, many rows (~2 million)
tx_ranges = as.data.frame(gnm_tx@unlistData) %>%
dplyr::select(-names, -seq_strand, -exon_id, -exon_rank)
# join by chromosomal position (making a column that includes chromosome + position)
tx_ranges$search_coords = paste0(tx_ranges$seq_name, ":", tx_ranges$seq_start, "-", tx_ranges$seq_end)
# join with transcripts found earlier (already selected for high quality)
unique_tx_select = unique(tx_seq$transcript_mane_select)
#get ensembl transcripts associated with these
unique_tx_select_ensembl = (tx_seq %>% dplyr::filter(transcript_mane_select %in% unique_tx_select))$ensembl_transcript_id
# filter returned transcripts
tx_ranges_filtered = tx_ranges %>% dplyr::filter(tx_id %in% unique_tx_select_ensembl)
# these are all the variants (excluding the specific ppis)
txs_joined_ranges = (tx_seq %>% dplyr::select(-protein)) %>%
left_join(tx_ranges_filtered, by = c("ensembl_transcript_id" = "tx_id", "search_coords" = "search_coords"),
keep = F) %>%
dplyr::filter(start != '') %>% distinct() %>% drop_na() %>% dplyr::select(-seq_start, -seq_end, -seq_name)
# saveRDS(txs_joined_ranges, "/ix/djishnu/Aaron/data/output/mapped_transcript_sequences.RDS")
# txs_joined_ranges = readRDS("/ix/djishnu/Aaron/data/mapped_transcript_sequences.RDS")
# get cds positions for each transcript (don't strictly need)
ranges_uncomp = IRanges(start = txs_joined_ranges$start,
width = txs_joined_ranges$width,
names = txs_joined_ranges$ensembl_transcript_id)
cds_pos_from_transcript = transcriptToCds(ranges_uncomp, db = EnsDb.Hsapiens.v86)
# saveRDS(cds_pos_from_transcript,"/ix/djishnu/Aaron/data/output/cds_pos_from_transcript.RDS")
# cds_pos_from_transcript = readRDS("/ix/djishnu/Aaron/data/output/cds_pos_from_transcript.RDS")
# map transcript to protein
protein_from_tx = transcriptToProtein(ranges_uncomp, db = EnsDb.Hsapiens.v86)
# filter for transcripts with cds_ok = T
protein_from_txdf = as.data.frame(protein_from_tx) %>% distinct() %>% dplyr::filter(cds_ok == "TRUE")
# saveRDS(protein_from_tx, "/ix/djishnu/Aaron/data/output/protein_transcript_map.RDS")
names(protein_from_txdf)[1:2] = c("residue_start", "residue_end")
names(protein_from_txdf)[4] = "ensembl_protein_id"
# join everything
fully_joined_ids = (txs_joined_ranges %>% dplyr::select(-width)) %>% left_join(protein_from_txdf,
by = c("ensembl_transcript_id" = "tx_id",
"start" = "tx_start",
"end" = "tx_end"), keep = F)
# saveRDS(fully_joined_ids, "/ix/djishnu/Aaron/data/output/mapped_snp_to_protein.RDS")