-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathAtha_NLRome_OutgroupPlacement.R
173 lines (150 loc) · 6.43 KB
/
Atha_NLRome_OutgroupPlacement.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
## ---------------------------
##
## Script name: Atha_NLRome_OutgroupPlacement.R
##
## Purpose of script: Place outgroup sequences in the NLRome tree, output tree annotations and propagate into clade trees
##
## Author: Daniil Prigozhin
##
## Date Created: 2020-11-02
##
## Copyright (c) Daniil Prigozhin, 2020
## Email: daniilprigozhin@lbl.gov
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------
## load packages
require(tidyverse)
require(tidytree)
require(treeio)
## set working directory
setwd("~/BioInf/ArabidopsisRENSEQ/Phylogeny/")
a<-read.jplace("/Users/prigozhin/BioInf/ArabidopsisRENSEQ/Phylogeny/RAxML_NLRome_237min_Outgroup/EPA_z/epa_result.jplace")
big_table <- left_join(as_tibble(a@phylo),a@placements)%>% filter(!is.na(name))
placement <- big_table
### import NLRome tree rooted in iTOL
placement %>% print(n=500)
tree_pl <- left_join(x,placement)
placement %>% filter(grepl("CRUB",name))->crub_place
placement %>% filter(grepl("ALYR",name))->alyr_place
alyr_place %>% filter(label == "Int11609")
### Print symbol dataset ----
sink("alyr_placement.txt", append = F)
cat("DATASET_SYMBOL
SEPARATOR SPACE
#label is used in the legend table (can be changed later)
DATASET_LABEL A.lyrataPlacement
#dataset color (can be changed later)
COLOR #FFFF00
MAXIMUM_SIZE 10
DATA
")
ii<-1
for (ii in 1:length(alyr_place$label)){
cat(alyr_place[ii,4] %>% unlist())
cat(" 2 10 #ffff00 1 0.25 ")
cat(as.character(alyr_place$name)[ii] %>% unlist())
cat("\n")
}
sink()
#
### Print symbol dataset ----
sink("crub_placement.txt", append = F)
cat("DATASET_SYMBOL
SEPARATOR SPACE
#label is used in the legend table (can be changed later)
DATASET_LABEL C.rubellaPlacement
#dataset color (can be changed later)
COLOR #00FF00
MAXIMUM_SIZE 10
DATA
")
ii<-1
for (ii in 1:length(crub_place$label)){
cat(crub_place[ii,4] %>% unlist())
cat(" 2 10 #00ff00 1 0.75 ")
cat(as.character(crub_place$name)[ii] %>% unlist())
cat("\n")
}
sink()
#
## for every name in placement find an ancestor in partition
placement %>% print(n=400)
cname <- "CRUB_R0I9X3/50-376"
find_clade_for_og <- function(cname){
lines <- placement %>% filter(name == cname) %>% arrange(-like_weight_ratio)
lab <- lines[1,4]
node <- x %>% filter(label %in% lab) %>% select(node) %>% unlist
ancestors <- ancestor(x,node) %>% union(x[node,])
label <- intersect(ancestors,partition) %>% select(label)
if(nrow(label) == 0){return(NA)} else if(nrow(label ==1)){return(unlist(label))}else {return("MULTI")}
}
test <- placement %>% mutate(Clade = lapply(placement$name, find_clade_for_og)%>%unlist)
test %>% print(n=400)
test %>% filter(grepl("CRUB_R0GTC0",name)) %>% select(Clade) %>% unlist
Summary <- test %>% select(Clade,name) %>% distinct %>% group_by(Clade)%>%summarise(Number = n())
Alyr <- test %>% filter(grepl("ALYR",name)) %>% select(Clade,name) %>% distinct %>% group_by(Clade)%>%summarise(Number_alyr = n())
Crub <- test %>% filter(grepl("CRUB",name)) %>% select(Clade,name) %>% distinct %>% group_by(Clade)%>%summarise(Number_crub = n())
All<-left_join(left_join(Summary,Alyr),Crub) %>% print(n=100)
All[54,1]<-"None"
All[is.na(All)]<-0
All %>% print(n=100)
All_clades <- filter(partition, label %ni% All$Clade)%>%select(label) %>% mutate(Clade = label) %>% select (Clade) %>% mutate(Number=0, Number_alyr=0, Number_crub=0) %>% rbind(All)
All_clades %>% arrange(Clade)->All_clades
All_clades %>% print(n=100)
test %>% select(Clade,name) %>% distinct %>% filter(Clade == "Int14015") %>% print(n=300)
Common %>% select(Clade_0) %>% arrange %>% distinct %>% write_delim("~/Desktop/Clade_names.txt", delim = "\t")
All_clades %>% arrange(Clade) %>% write_delim("~/Desktop/Clade_numbers.txt", delim = "\t")
Common %>% filter(Allele ==1, Ecotype== "Athaliana") %>% group_by(Clade_0) %>% summarise(n=n()) %>% write_delim("~/Desktop/Col0_numbers.txt", delim = "\t")
Named_genes<-read_delim("/Users/prigozhin/Box/NLR_binding_site_prediction/analyses/ArabidopsisRENSEQ/Phylogeny/Autoclades_70/AthaKnownGenes.txt", delim = "\t", col_names = c("Gene","CommonName"))
left_join(Common,Named_genes)%>%filter(!is.na(CommonName))%>%select(Clade_0,CommonName)%>% arrange(Clade_0)%>%write_delim("~/Desktop/Repres.txt", delim = "\t")
Common %>% select(Clade_0,Clade) %>% distinct %>% filter(Clade_0 =="Int10969_308")
### for every node, make a list of outgroup sequences placed at or below the node
(placement$label %in% x$label)
x_pl<- left_join(x,placement %>% mutate(OG = name) %>% select(OG,label))
x_pl %>% filter(!is.na(OG)) %>% print(n=200)
placement %>% filter(label == "Int9553")
x %>% filter(label == "Int9553")
clade_og <- vector()
for (clade in partition$label){
nd <- x %>% filter(label == clade)
node_set <- offspring(x,nd$node,tiponly = F, self_include = T) %>% select(node,label)
line <- placement %>% filter(label %in% node_set$label) %>% select(name) %>% distinct() %>% mutate(Clade=clade)
clade_og <- rbind(line,clade_og)
}
clade_og
(clade_og$Clade %in% partition$label)
(clade_og$Clade %in% Summary$Clade)
partition %>% filter(label %in% clade_og$Clade)
partition %>% filter(label %ni% clade_og$Clade)
partition %>% filter(label %ni% clade_og$Clade) %>% select(N_tips) %>% sum() ## 635 tips in initial clades without Capsella or lyrata sequences
partition %>% filter(label %ni% Summary$Clade) %>% select(N_tips) %>% sum() ## 635 tips in initial clades without Capsella or lyrata sequences
clade_og %>% mutate(Name = str_remove(name,"\\/.*$"), Clade = as.factor(Clade)) %>% select(-name) -> export_og
export_og
(export_og$Clade %in% str_remove(Common$Clade_0,"_.*"))
## Use Common table to output outgroup ID's for every node
Common %>% mutate(Clade = str_remove(Common$Clade_0,"_.*")) %>%
select(Clade_0, Clade) %>%
distinct() %>%
left_join(export_og) %>% select(-Clade) -> export_og
export_og %>% mutate(Clade_0 = as.factor(Clade_0)) %>% filter(!is.na(Name))->export_og
for (clade in levels(export_og$Clade_0)){
tips <- export_og %>% filter(Clade_0 ==clade) %>% select(Name)
if (nrow(tips) >0){print(clade)
print(tips)
write_delim(x = as.data.frame(tips),
path = paste0("./Autoclades_70/",clade,".outgroup.txt"),
delim = "\t",quote_escape = "double",
append = F,col_names = F)
}
}
### Import jplace files for refined clades -----
jplace_files <- list.files("Autoclades_70/Outgroup", pattern = ".*jplace",recursive = F,full.names = T)
jplace<- read.jplace(jplace_files[[1]])
jplace@placements
as_tibble(jplace) %>% print(n=2000)
as_tibble(jplace)