EE282 HW4

To download the all chromosomes fasta file into a new folder:

mkdir melanogaster_data
cd melanogaster_data
wget ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/current/fasta/dmel-all-chromosome-r6.24.fasta.gz

To calculate total number of nucleotides, total number of Ns, and total number of all sequences ≤ 100kb and all sequences > 100kb:

module load jje/jjeutils
module load jje/kent

bioawk -c fastx 'length($seq) > 100000{ print ">"$name; print $seq }' dmel-all-chromosome-r6.24.fasta.gz > 100kb.fa

bioawk -c fastx 'length($seq) <= 100000{ print ">"$name; print $seq }' dmel-all-chromosome-r6.24.fasta.gz > 99kb.fa


faSize 100kb.fa > summary_100kb.txt
faSize 99kb.fa > summary_99kb.txt

CDF plot for the whole genome, for all sequences ≤ 100kb, and all sequences > 100kb:


bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.fasta.gz \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nWG_Ctg\t0" } { print "WG_Ctg\t" $1 } ' \
> Whole_genome_sort.txt

bioawk -c fastx ' { print length($seq) }' 100kb.fa \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\n100kb_Ctg\t0" } { print "100kb_Ctg\t" $1 } ' \
> 100kb_sort.txt


bioawk -c fastx ' { print length($seq) }' 99kb.fa  \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\n99kb_Ctg\t0" } { print "99kb_Ctg\t" $1 } ' \
> 99kb_sort.txt

plotCDF2 {Whole_genome,100kb,99kb}_sort.txt /dev/stdout \
| tee WG_v_100_99.png \
| display

CDF Plot

Sequence length distribution and GC content plots for the whole genome, for all sequences ≤ 100kb, and all sequences > 100kb:


bioawk -c fastx ' { print gc($seq) }' dmel-all-chromosome-r6.24.fasta.gz | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > Whole_genome_gc_sort.txt

bioawk -c fastx ' { print gc($seq) }' 100kb.fa | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > 100kb_gc_sort.txt

bioawk -c fastx ' { print gc($seq) }' 99kb.fa | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > 99kb_gc_sort.txt

module load R
R
kb99=read.table("99kb_sort.txt", header=T, dec=".", sep="\t")
kb100=read.table("100kb_sort.txt", header=T, dec=".", sep="\t")
WG=read.table("Whole_genome_sort.txt", header=T, dec=".", sep="\t")
kb99_gc=read.table("99kb_gc_sort.txt", header=T, dec=".", sep="\t")
kb100_gc=read.table("100kb_gc_sort.txt", header=T, dec=".", sep="\t")
WG_gc=read.table("Whole_genome_gc_sort.txt", header=T, dec=".", sep="\t")
library(ggplot2)

png(filename="kb99_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb99_plot <- ggplot(data = kb99)
kb99_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()

png(filename="kb100_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb100_plot <- ggplot(data = kb100)
kb100_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()

png(filename="WG_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
WG_plot <- ggplot(data = WG)
WG_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()


png(filename="kb100_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb100_gc_plot <- ggplot(data = kb100_gc)
kb100_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

png(filename="kb99_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb99_gc_plot <- ggplot(data = kb99_gc)
kb99_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

png(filename="WG_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
WG_gc_plot <- ggplot(data = WG_gc)
WG_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

Whole genome sequence length

>100kb sequence length

<100kb sequence length

Whole genome gc

>100kb gc

<100kb gc

Comments on "Summarize partitions of a genome assembly"

Great job. Please try to put all results in this document. I trust the results redirected from faSize are accurate, but I can't tell for sure since they aren't in the markdown document.

Assemble a genome from MinION reads:

module load jje/jjeutils

n50 () {
  bioawk -c fastx ' { print length($seq); n=n+length($seq); } END { print n; } ' $1 \
  | sort -rn \
  | gawk ' NR == 1 { n = $1 }; NR > 1 { ni = $1 + ni; } ni/n > 0.5 { print $1; exit; } '
}

minimap=$(which minimap)
miniasm=$(which miniasm)
basedir=/pub/jje/ee282/$USER
projname=nanopore_assembly
basedir=$basedir/$projname
raw=$basedir/$projname/data/raw
processed=$basedir/$projname/data/processed
figures=$basedir/$projname/output/figures
reports=$basedir/$projname/output/reports

createProject $projname $basedir
ln -sf /bio/share/solarese/hw4/rawdata/iso1_onp_a2_1kb.fastq $raw/reads.fq

$minimap -t 32 -Sw5 -L100 -m0 $raw/reads.fq{,} \
| gzip -1 \
> $processed/onp.paf.gz

$miniasm -f $raw/reads.fq $processed/onp.paf.gz \
> $processed/reads.gfa

awk ' $0 ~/^S/ { print ">" $2" \n" $3 } ' $processed/reads.gfa \
| tee >(n50 /dev/stdin > $reports/n50.txt) \
| fold -w 60 \
> $processed/unitigs.fa


cp unitig.fa ../../../../melanogaster_data

Assembly assessment

N50 was calculated above, to see results:

less ./nanopore_assembly/nanopore_assembly/output/reports/n50.txt

n50= 4494246

Compare your assembly to the contig assembly from Drosophila melanogaster on FlyBase using Mummer

module load jje/jjeutils 
module load jje/kent
faSplitByN dmel-all-chromosome-r6.24.fasta.gz  dmel-all-chromosome-r6.24.contigs.fasta.gz  10

###Loading of binaries via module load or PATH reassignment
source /pub/jje/ee282/bin/.qmbashrc
module load gnuplot/4.6.0

###Query and Reference Assignment. State my prefix for output filenames
REF="dmel-all-chromosome-r6.24.contigs.fasta"
PREFIX="flybase"
SGE_TASK_ID=1
QRY=$(ls u*.fa | head -n $SGE_TASK_ID | tail -n 1)
PREFIX=${PREFIX}_$(basename ${QRY} .fa)

###please use a value between 75-150 for -c. The value of 1000 is too strict.
nucmer -l 100 -c 100 -d 10 -banded -D 5 -prefix ${PREFIX} ${REF} ${QRY}
mummerplot --fat --layout --filter -p ${PREFIX} ${PREFIX}.delta \
  -R ${REF} -Q ${QRY} --png

Compare your assembly to both the contig assembly and the scaffold assembly from the Drosophila melanogaster on FlyBase using a contiguity plot

module load jje/jjeutils 
module load jje/kent

bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.fasta.gz \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nScaff_Ctg\t0" } { print "Scaff_Ctg\t" $1 } ' \
> Scaffold_sort.txt

bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.contigs.fasta \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nContig_Ctg\t0" } { print "Contig_Ctg\t" $1 } ' \
> Contigs_sort.txt 


bioawk -c fastx ' { print length($seq) }' unitigs.fa \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nMine_Ctg\t0" } { print "Mine_Ctg\t" $1 } ' \
> My_assembly_sort.txt


plotCDF2 {Scaffold,Contigs,My_assembly}_sort.txt /dev/stdout \
| tee Scaf_Contig_Mine_CDF.png \
| display

Calculate BUSCO scores of both assemblies and compare them

Your working directory should contain the assembly fasta you want the BUSCO scores for.

module load augustus/3.2.1
module load blast/2.2.31 hmmer/3.1b2 boost/1.54.0
source /pub/jje/ee282/bin/.buscorc

INPUTTYPE="geno"
MYLIBDIR="/pub/jje/ee282/bin/busco/lineages/"
MYLIB="diptera_odb9"
OPTIONS="-l ${MYLIBDIR}${MYLIB}"
QRY="unitigs.fa"
MYEXT=".fasta" 


#you can change the value after -c to tell busco how many cores to run on. 
BUSCO.py -c 40 -i ${QRY} -m ${INPUTTYPE} -o $(basename ${QRY} ${MYEXT})_${MYLIB}${SPTAG} ${OPTIONS}

##run for contigs assembly

INPUTTYPE="geno"
MYLIBDIR="/pub/jje/ee282/bin/busco/lineages/"
MYLIB="diptera_odb9"
OPTIONS="-l ${MYLIBDIR}${MYLIB}"
QRY="dmel-all-chromosome-r6.24.contig.fasta"
MYEXT=".fasta" 


#you can change the value after -c to tell busco how many cores to run on. 
BUSCO.py -c 40 -i ${QRY} -m ${INPUTTYPE} -o $(basename ${QRY} ${MYEXT})_${MYLIB}${SPTAG} ${OPTIONS}

Hi, you are missing your n50 and busco results here. Please update them.

I updated the n50, but I never got buscos to actually run to completion without errors unfortunately :( I did get a chance to look at another students results though, so I do know what they should look like if I needed to run it in the future. Thanks.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

EE282 HW4.md

EE282 HW4.md

EE282 HW4

Comments on "Summarize partitions of a genome assembly"

Hi, you are missing your n50 and busco results here. Please update them.

I updated the n50, but I never got buscos to actually run to completion without errors unfortunately :( I did get a chance to look at another students results though, so I do know what they should look like if I needed to run it in the future. Thanks.

Files

EE282 HW4.md

Latest commit

History

EE282 HW4.md

File metadata and controls

EE282 HW4

Comments on "Summarize partitions of a genome assembly"

Hi, you are missing your n50 and busco results here. Please update them.

I updated the n50, but I never got buscos to actually run to completion without errors unfortunately :( I did get a chance to look at another students results though, so I do know what they should look like if I needed to run it in the future. Thanks.