-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathget_one2one_orthologs__Ensembl_API__process_orthologs.r
111 lines (87 loc) · 6.53 KB
/
get_one2one_orthologs__Ensembl_API__process_orthologs.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#####################################
## Robin van der Lee ##
## robinvanderlee AT gmail DOT com ##
############################################################################################################
## Genome-scale detection of positive selection in 9 primates predicts human-virus evolutionary conflicts ##
## Robin van der Lee, Laurens Wiel, Teunis J.P. van Dam, Martijn A. Huynen ##
############################################################################################################
### INPUT FILE
#head -10 one2one_orthologs_primates_20150303-1449.txt
# ENSG00000228913
# ENSG00000275008
# ENSG00000180383 pan_troglodytes ENSPTRG00000013353 ENSPTRP00000022893 nomascus_leucogenys ENSNLEG00000009576 ENSNLEP00000011652 chlorocebus_sabaeus ENSCSAG00000012789 ENSCSAP00000008967 papio_anubis ENSPANG00000016042 ENSPANP00000016887 macaca_mulatta ENSMMUG00000005116 ENSMMUP00000006779 callithrix_jacchus ENSCJAG00000020809 ENSCJAP00000038711
# ENSG00000277657
# ENSG00000162444 pongo_abelii ENSPPYG00000001904 ENSPPYP00000002207 macaca_mulatta ENSMMUG00000002219 ENSMMUP00000002963 chlorocebus_sabaeus ENSCSAG00000000775 ENSCSAP00000017231 papio_anubis ENSPANG00000015220 ENSPANP00000004180 callithrix_jacchus ENSCJAG00000005470 ENSCJAP00000010001 nomascus_leucogenys ENSNLEG00000009377 ENSNLEP00000011417 pan_troglodytes ENSPTRG00000000125 ENSPTRP00000000237 gorilla_gorilla ENSGGOG00000028335 ENSGGOP00000018605
# ENSG00000165583 gorilla_gorilla ENSGGOG00000010688 ENSGGOP00000010422 pongo_abelii ENSPPYG00000020365 ENSPPYP00000022796
# ENSG00000104671 pan_troglodytes ENSPTRG00000020134 ENSPTRP00000044768 gorilla_gorilla ENSGGOG00000013074 ENSGGOP00000012754 pongo_abelii ENSPPYG00000018486 ENSPPYP00000020722 nomascus_leucogenys ENSNLEG00000011906 ENSNLEP00000014459 macaca_mulatta ENSMMUG00000013735 ENSMMUP00000018058 papio_anubis ENSPANG00000008817 ENSPANP00000002958 chlorocebus_sabaeus ENSCSAG00000015839 ENSCSAP00000011920 callithrix_jacchus ENSCJAG00000012338 ENSCJAP00000022619
# ENSG00000101400 pan_troglodytes ENSPTRG00000013400 ENSPTRP00000022970 gorilla_gorilla ENSGGOG00000008508 ENSGGOP00000008318 pongo_abelii ENSPPYG00000010924 ENSPPYP00000012200 nomascus_leucogenys ENSNLEG00000009970 ENSNLEP00000012154 papio_anubis ENSPANG00000016612 ENSPANP00000016827 chlorocebus_sabaeus ENSCSAG00000012284 ENSCSAP00000008479 macaca_mulatta ENSMMUG00000012719 ENSMMUP00000016694 callithrix_jacchus ENSCJAG00000018516 ENSCJAP00000034358
# ENSG00000228691
# ENSG00000196368 pan_troglodytes ENSPTRG00000021910 ENSPTRP00000037622 gorilla_gorilla ENSGGOG00000015735 ENSGGOP00000015349 nomascus_leucogenys ENSNLEG00000008174 ENSNLEP00000009962 macaca_mulatta ENSMMUG00000032455 ENSMMUP00000012438 papio_anubis ENSPANG00000006680 ENSPANP00000009391
# ENSG00000228913
# ENSG00000275008
# ENSG00000180383 pan_troglodytes ENSPTRG00000013353 ENSPTRP00000022893 nomascus_leucogenys ENSNLEG00000009576 ENSNLEP00000011652 chlorocebus_sabaeus ENSCSAG00000012789 ENSCSAP00000008967 papio_anubis ENSPANG00000016042 ENSPANP00000016887 macaca_mulattENSMMUG00000005116 ENSMMUP00000006779 callithrix_jacchus ENSCJAG00000020809 ENSCJAP00000038711
orthologs <- read.table(unz("ensembl78_one2one_orthologs_primates_20150303-1449.txt.zip", "one2one_orthologs_primates_20150303-1449.txt"), header = F, sep = "\t", quote = "", na.strings = "", fill = T)
dim(orthologs)
head(orthologs)
summary(orthologs)
table(table(orthologs[,1]))
### remove last column, which is always empty
orthologs <- orthologs[,-26]
### discard rows where one or more species do not have an ortholog listed
# we search for orthologs to human genes (column 1) in 8 species
# if a species is found, there are 3 columns: species_name, ensembl gene id, ensembl protein id
num.species.required <- 8
num.cols.required <- num.species.required * 3 + 1
keep.discard <- apply(orthologs, MARGIN = 1, function(x){
if(length(which(!is.na(x))) == num.cols.required){
return("keep");
} else {
return("discard");
}
})
keep.discard
table(keep.discard)
orthologs.all.species <- orthologs[keep.discard == "keep",]
dim(orthologs.all.species)
table(table(orthologs.all.species[,1])) # 11170 human genes with 1:1:1:1.. orthologs in all 9 primates
apply(orthologs.all.species, 2, function(x) table(table(x)))
### sort on human ID
orthologs.all.species <- orthologs.all.species[order(orthologs.all.species[,1]),]
head(orthologs.all.species)
### order columns
# biomart approach had this order:
# Ensembl.Gene.ID Chimpanzee.Ensembl.Gene.ID Gibbon.Ensembl.Gene.ID Gorilla.Ensembl.Gene.ID Marmoset.Ensembl.Gene.ID Olive.baboon.Ensembl.Gene.ID Orangutan.Ensembl.Gene.ID Vervet.AGM.Ensembl.Gene.ID Macaque.Ensembl.Gene.ID
table(as.vector(as.matrix(orthologs.all.species[,seq(2,25,3)])))
write.table(t(t(names(table(as.vector(as.matrix(orthologs.all.species[,seq(2,25,3)])))))), quote = F, row.names = F, col.names = F)
species.order <- c("pan_troglodytes",
"nomascus_leucogenys",
"gorilla_gorilla",
"callithrix_jacchus",
"papio_anubis",
"pongo_abelii",
"chlorocebus_sabaeus",
"macaca_mulatta")
species.order
get.ordered.orthologs <- function(x) {
x <- as.vector(as.matrix(x))
# exclude human element
human.id <- x[1]
x <- x[-1]
split.factor <- sort(rep(1:(length(x)/3),3)) # groups for splitting
split.list <- split(x, split.factor)
order.hash <- unlist(lapply(split.list, function(y){ return(which(species.order == y[1])) })) # check which position each list element is supposed to have
sorted.hash <- sort(order.hash) # sorted hash, where the names now contain the correct order of split.list elements
split.list.sorted <- split.list[as.integer(names(sorted.hash))] # sorted list
c(human.id, as.vector(unlist(lapply(split.list.sorted, function(z){ return(z[2]) } ))))
}
orthologs.all.species.ordered.frame <- matrix(nrow = nrow(orthologs.all.species), ncol = 9)
for(i in 1:nrow(orthologs.all.species)){
row <- orthologs.all.species[i,]
orthologs.all.species.ordered.frame[i,] <- get.ordered.orthologs(row)
}
orthologs.all.species.ordered.frame <- as.data.frame(orthologs.all.species.ordered.frame)
names(orthologs.all.species.ordered.frame) <- c("homo_sapiens", species.order)
dim(orthologs.all.species.ordered.frame)
head(orthologs.all.species.ordered.frame)
### write tables
write.table(orthologs.all.species.ordered.frame, file = "ensembl_API_one2one_ortholog_clusters.txt", quote = F, sep = "\t", row.names = F, col.names = T)