Niklas Schandry
Here I provide a function to read in SyRi outputs from
nf-plotsv
for plotting.
Running this requires ‘tidyverse’ (dplyr
, dtplyr
, magrittr
, and
vroom
) and the output is designed to be compatible with
gggenomes
for plotting.
This repo also comes with a snapshot that can be used with
renv::restore()
.
The calculation of polygons to draw curves between sequences is directly
lifted from the amazing
GENESPACE
package, but
GENESPACE
is not a dependency.
The files included in data/
for demonstration are the
plotsr
example files.
parse_syri()
was intended to work with the outputs from
nf-plotsv
. Therefore, the
script expects the SyRi output to be named
genomeA_on_genomeB.syri.out
, and will split based on this. There is
no flexibility here.
parse_syri()
has a number of arguments. Most of those are outlined
below with examples:
files: a list of files. Each file name is expected to end with `.syri.out`
and to follow the naming scheme A_on_B.syri.out
order: a dataframe with a column bin_id , containing the order of genomes
chroms: (optional) list of chromosomes to retain.
spacing: spacing between chromosomes from the same genome (bin_id).
This spacing works the same way as the spacing parameter of
gggenomes: "between sequences in bases (>1) or relative to
longest bin (<1)", which is actually relative to
(longest bin)/sqrt(number of seq_ids).
Default: 0.05
resize_polygons: (logical) should polygons of short links be resized?
Default: TRUE
resize_polygons_size: if polygons are resized, to what fraction of the total length?
Default: 0.003
min_polygon_feat_size: minimum length of links to be resized. Default: 5000
no_polygons: (logical) do not compute polygons.
Default: FALSE, will compute polygons.
verbose: (logical), if TRUE returns some extra information for debugging.
Default: FALSE
parse_syri()
returns a list of data-frames:
$seqs: contains sequence information, compatible with gggenomes
$links: contains links between sequences, compatible with gggenomes
$polys: contains polygons that can be plotted via `geom_polygon()`
In this example, genomeA
is col
and genomeB
is ler.
The output from SyRi can be parsed using parse_syri()
(in
functions/parse_syri.R
)
If it is not installed yet, I recommend installing
gggenomes
.
renv::install("tidyverse","thackl/gggenomes")
parse_syri()
builds on tidyverse
and uses some special pipes from
magrittr
.
library(tidyverse)
library(gggenomes)
library(magrittr)
source("functions/parse_syri.R") # Contains syri_plot_fills
Data is read using parse_syri()
.
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler"))
)
After parsing the data, it is ready for plotting.
The parsed data can be used with gggenomes
geoms, such as geom_seq
,
geom_bin
, geom_link
, etc.
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
geom_link(aes(fill = type),color = NA) +
syri_plot_fills +
ggtitle("Synteny between Col and Ler")
gggenomes::geom_link()
currently draws simple rectangles. An
alternative is to draw sequence relationships using polygons. These
polygons are computed during parsing (unless no_polygons
is set to
TRUE
) and returned in a dataframe in the $polys
slot of the list.
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
#geom_link() +
syri_plot_fills +
ggtitle("Synteny between Col and Ler")
Sometimes, only a subset of chromosomes is relevant. parse_syri()
expects chromosome names to be identical across genomes. If that is the
case, chromosomes can be selected with the chroms
parameter:
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
chroms = c("Chr1","Chr3")
)
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
#geom_link() +
syri_plot_fills +
ggtitle("Synteny between Col and Ler Chromosomes 1 and 3")
Sometimes, the default spacing between chromosomes may not be optimal.
parse_syri()
follows gggenomes in spacing rules. If spacing is < 1,
it is relative to the longest bin / sqrt(number of sequences); if it is
>= 1, it is given in base pairs. The default is 0.05 (as for gggenomes).
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
spacing = 5000000 # spacing in bp
)
Of course, if the spacing is changed in parse_syri(), the same value also needs to be passed to gggenomes:
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links,
spacing = 5000000) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col - Ler with 5MB spacing between chromsomes")
Increasing the relative spacing to 0.2, i.e. 4 times the default of 0.05:
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
spacing = 0.2 # relative spacing
)
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links,
spacing = 0.2) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col - Ler, spacing increased 4x")
By default, short syntenic regions larger than 5000 bp are resized to make them visible. Since the resized polygons no longer reflect the original input, resizing can be disabled:
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
resize_polygons = F)
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col and Ler without resizing")
Only regions larger than min_polygon_feat_size
are resized (default:
5000). This threshold can be lowered to also include smaller regions:
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
resize_polygons = T,
min_polygon_feat_size = 1000)
Naturally, this will create a busier plot.
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col and Ler, resizing regions larger than 999bp")
Regions are resized to have a certain length relative to the chromosome,
controlled by resize_polygons_size
, which defaults to 0.003
(0.3%)
of the chromosome length. Changing this parameter will make resized
regions larger or smaller.
dat <- parse_syri("data/col_on_ler.syri.out",
order = data.frame(bin_id = c("col","ler")),
resize_polygons = T,
resize_polygons_size = 0.01)
This will produce wider polygons for resized links.
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col and Ler")
Comparing two genomes is nice, but more might be better.
parse_syri()
can handle multiple inputs in one go when those are
provided as a list:
file_list <- list.files("data", full.names = T)
syri_order <- data.frame(bin_id = c("col", "ler", "cvi", "eri"))
dat <- parse_syri(file_list, order = syri_order)
Making a plot from this works the same way as making a plot of only one
comparison. The order of sequences is set via the order
argument to
parse_syri()
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_seq(linewidth = 1) +
geom_bin_label(size=7) +
syri_plot_fills +
ggtitle("Synteny between Col - Ler - Cvi - Eri")
By default, parse_syri()
takes all chromosomes from the same genome
(bin_id
) and puts them on one axis, adding space between them as
needed (See spacing).
Sometimes, it might be useful to have the chromosomes each on their own
coordinate system instead. This can be done by making use of the
chroms
argument to read each chromosome
individually and then combining them. Below is an example for the
col-ler-cvi-eri data used above and included in data/
.
file_list <- list.files("data", full.names = T)
syri_order <- data.frame(bin_id = c("col", "ler", "cvi", "eri"))
chromosomes <- c("Chr1", "Chr2", "Chr3", "Chr4", "Chr5")
dat_tmp <- lapply(chromosomes,
\(chrom) parse_syri(file_list, order = syri_order, chroms = chrom))
# Bind sequences
dat$seqs <- lapply(1:length(chromosomes), \(l) pluck(dat_tmp, l, "seqs")) %>%
bind_rows()
# Create y coordinates for sequences
seq_pos <- left_join(dat$seqs, syri_order %>%
mutate(y = rev(1:length(bin_id))),
by = join_by(bin_id))
# Bind links
dat$links <- lapply(1:length(chromosomes), \(l) pluck(dat_tmp, l, "links")) %>%
bind_rows()
# Bind polygons
dat$polys <- lapply(1:length(chromosomes), \(l) pluck(dat_tmp, l, "polys")) %>%
bind_rows()
# Add seq_id column to polygons, only keep polygons that connect the same chromosome
dat$polys <- dat$polys %>%
mutate(
Chr_grp1 = str_extract_all(link, "Chr[0-9]*", simplify = T)[, 1],
Chr_grp2 = str_extract_all(link, "Chr[0-9]*", simplify = T)[, 2]
) %>%
filter(Chr_grp1 == Chr_grp2) %>%
mutate(seq_id = Chr_grp1)
Note that geom_segment()
should be used to draw chromosomes, since
geom_seq()
would again place the chromosomes onto a single axis.
gggenomes::gggenomes(seqs = dat$seqs,
links = dat$links) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type == "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.6
) +
geom_polygon(
data = dat$polys %>% filter(direct) %>% filter(type != "SYN"),
aes(
x = x,
y = y,
fill = type,
group = link_grp
),
alpha = 0.8
) +
geom_segment(aes(x = 0, xend = length, y=y, yend=y), data = seq_pos) +
geom_bin_label(size=7,
# Avoid overly long extension of x to the left
expand_left = 1e-2,
nudge_left = 5e-3) +
facet_wrap(~seq_id, ncol = 1, scales = "free_x") +
syri_plot_fills +
theme(strip.background = element_rect(fill = "white")) +
ggtitle("Synteny between Col - Ler - Cvi - Eri")
If you encounter any problems, please open an issue.
If you have suggestions for improvement, please open a pull request.