Skip to content

Latest commit

 

History

History
309 lines (222 loc) · 8.95 KB

EE282 HW4.md

File metadata and controls

309 lines (222 loc) · 8.95 KB

EE282 HW4

To download the all chromosomes fasta file into a new folder:

mkdir melanogaster_data
cd melanogaster_data
wget ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/current/fasta/dmel-all-chromosome-r6.24.fasta.gz

To calculate total number of nucleotides, total number of Ns, and total number of all sequences ≤ 100kb and all sequences > 100kb:

module load jje/jjeutils
module load jje/kent

bioawk -c fastx 'length($seq) > 100000{ print ">"$name; print $seq }' dmel-all-chromosome-r6.24.fasta.gz > 100kb.fa

bioawk -c fastx 'length($seq) <= 100000{ print ">"$name; print $seq }' dmel-all-chromosome-r6.24.fasta.gz > 99kb.fa


faSize 100kb.fa > summary_100kb.txt
faSize 99kb.fa > summary_99kb.txt

CDF plot for the whole genome, for all sequences ≤ 100kb, and all sequences > 100kb:


bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.fasta.gz \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nWG_Ctg\t0" } { print "WG_Ctg\t" $1 } ' \
> Whole_genome_sort.txt

bioawk -c fastx ' { print length($seq) }' 100kb.fa \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\n100kb_Ctg\t0" } { print "100kb_Ctg\t" $1 } ' \
> 100kb_sort.txt


bioawk -c fastx ' { print length($seq) }' 99kb.fa  \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\n99kb_Ctg\t0" } { print "99kb_Ctg\t" $1 } ' \
> 99kb_sort.txt

plotCDF2 {Whole_genome,100kb,99kb}_sort.txt /dev/stdout \
| tee WG_v_100_99.png \
| display

CDF Plot CDF PLOT

Sequence length distribution and GC content plots for the whole genome, for all sequences ≤ 100kb, and all sequences > 100kb:


bioawk -c fastx ' { print gc($seq) }' dmel-all-chromosome-r6.24.fasta.gz | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > Whole_genome_gc_sort.txt

bioawk -c fastx ' { print gc($seq) }' 100kb.fa | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > 100kb_gc_sort.txt

bioawk -c fastx ' { print gc($seq) }' 99kb.fa | sort -rn | awk ' BEGIN { print "Assembly\tGC" } { print "WG_Ctg\t" $1 } ' > 99kb_gc_sort.txt

module load R
R
kb99=read.table("99kb_sort.txt", header=T, dec=".", sep="\t")
kb100=read.table("100kb_sort.txt", header=T, dec=".", sep="\t")
WG=read.table("Whole_genome_sort.txt", header=T, dec=".", sep="\t")
kb99_gc=read.table("99kb_gc_sort.txt", header=T, dec=".", sep="\t")
kb100_gc=read.table("100kb_gc_sort.txt", header=T, dec=".", sep="\t")
WG_gc=read.table("Whole_genome_gc_sort.txt", header=T, dec=".", sep="\t")
library(ggplot2)

png(filename="kb99_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb99_plot <- ggplot(data = kb99)
kb99_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()

png(filename="kb100_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb100_plot <- ggplot(data = kb100)
kb100_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()

png(filename="WG_size.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
WG_plot <- ggplot(data = WG)
WG_plot + geom_histogram(mapping = aes(x = Length), bins=10)
dev.off()


png(filename="kb100_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb100_gc_plot <- ggplot(data = kb100_gc)
kb100_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

png(filename="kb99_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
kb99_gc_plot <- ggplot(data = kb99_gc)
kb99_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

png(filename="WG_gc.png", 
    units="in", 
    width=5, 
    height=4, 
    pointsize=12, 
    res=72)
WG_gc_plot <- ggplot(data = WG_gc)
WG_gc_plot + geom_histogram(mapping = aes(x = GC), bins=10)
dev.off()

Whole genome sequence length WG_sl

>100kb sequence length 100_sl

<100kb sequence length 99_sl

Whole genome gc WG_gc

>100kb gc 100_gc

<100kb gc 99_gc

Comments on "Summarize partitions of a genome assembly"

Great job. Please try to put all results in this document. I trust the results redirected from faSize are accurate, but I can't tell for sure since they aren't in the markdown document.

Assemble a genome from MinION reads:

module load jje/jjeutils

n50 () {
  bioawk -c fastx ' { print length($seq); n=n+length($seq); } END { print n; } ' $1 \
  | sort -rn \
  | gawk ' NR == 1 { n = $1 }; NR > 1 { ni = $1 + ni; } ni/n > 0.5 { print $1; exit; } '
}

minimap=$(which minimap)
miniasm=$(which miniasm)
basedir=/pub/jje/ee282/$USER
projname=nanopore_assembly
basedir=$basedir/$projname
raw=$basedir/$projname/data/raw
processed=$basedir/$projname/data/processed
figures=$basedir/$projname/output/figures
reports=$basedir/$projname/output/reports

createProject $projname $basedir
ln -sf /bio/share/solarese/hw4/rawdata/iso1_onp_a2_1kb.fastq $raw/reads.fq

$minimap -t 32 -Sw5 -L100 -m0 $raw/reads.fq{,} \
| gzip -1 \
> $processed/onp.paf.gz

$miniasm -f $raw/reads.fq $processed/onp.paf.gz \
> $processed/reads.gfa

awk ' $0 ~/^S/ { print ">" $2" \n" $3 } ' $processed/reads.gfa \
| tee >(n50 /dev/stdin > $reports/n50.txt) \
| fold -w 60 \
> $processed/unitigs.fa


cp unitig.fa ../../../../melanogaster_data

Assembly assessment

N50 was calculated above, to see results:

less ./nanopore_assembly/nanopore_assembly/output/reports/n50.txt

n50= 4494246

Compare your assembly to the contig assembly from Drosophila melanogaster on FlyBase using Mummer

module load jje/jjeutils 
module load jje/kent
faSplitByN dmel-all-chromosome-r6.24.fasta.gz  dmel-all-chromosome-r6.24.contigs.fasta.gz  10

###Loading of binaries via module load or PATH reassignment
source /pub/jje/ee282/bin/.qmbashrc
module load gnuplot/4.6.0

###Query and Reference Assignment. State my prefix for output filenames
REF="dmel-all-chromosome-r6.24.contigs.fasta"
PREFIX="flybase"
SGE_TASK_ID=1
QRY=$(ls u*.fa | head -n $SGE_TASK_ID | tail -n 1)
PREFIX=${PREFIX}_$(basename ${QRY} .fa)

###please use a value between 75-150 for -c. The value of 1000 is too strict.
nucmer -l 100 -c 100 -d 10 -banded -D 5 -prefix ${PREFIX} ${REF} ${QRY}
mummerplot --fat --layout --filter -p ${PREFIX} ${PREFIX}.delta \
  -R ${REF} -Q ${QRY} --png

mummer

Compare your assembly to both the contig assembly and the scaffold assembly from the Drosophila melanogaster on FlyBase using a contiguity plot

module load jje/jjeutils 
module load jje/kent

bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.fasta.gz \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nScaff_Ctg\t0" } { print "Scaff_Ctg\t" $1 } ' \
> Scaffold_sort.txt

bioawk -c fastx ' { print length($seq) }' dmel-all-chromosome-r6.24.contigs.fasta \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nContig_Ctg\t0" } { print "Contig_Ctg\t" $1 } ' \
> Contigs_sort.txt 


bioawk -c fastx ' { print length($seq) }' unitigs.fa \
| sort -rn \
| awk ' BEGIN { print "Assembly\tLength\nMine_Ctg\t0" } { print "Mine_Ctg\t" $1 } ' \
> My_assembly_sort.txt


plotCDF2 {Scaffold,Contigs,My_assembly}_sort.txt /dev/stdout \
| tee Scaf_Contig_Mine_CDF.png \
| display

contigs

Calculate BUSCO scores of both assemblies and compare them

Your working directory should contain the assembly fasta you want the BUSCO scores for.

module load augustus/3.2.1
module load blast/2.2.31 hmmer/3.1b2 boost/1.54.0
source /pub/jje/ee282/bin/.buscorc

INPUTTYPE="geno"
MYLIBDIR="/pub/jje/ee282/bin/busco/lineages/"
MYLIB="diptera_odb9"
OPTIONS="-l ${MYLIBDIR}${MYLIB}"
QRY="unitigs.fa"
MYEXT=".fasta" 


#you can change the value after -c to tell busco how many cores to run on. 
BUSCO.py -c 40 -i ${QRY} -m ${INPUTTYPE} -o $(basename ${QRY} ${MYEXT})_${MYLIB}${SPTAG} ${OPTIONS}

##run for contigs assembly

INPUTTYPE="geno"
MYLIBDIR="/pub/jje/ee282/bin/busco/lineages/"
MYLIB="diptera_odb9"
OPTIONS="-l ${MYLIBDIR}${MYLIB}"
QRY="dmel-all-chromosome-r6.24.contig.fasta"
MYEXT=".fasta" 


#you can change the value after -c to tell busco how many cores to run on. 
BUSCO.py -c 40 -i ${QRY} -m ${INPUTTYPE} -o $(basename ${QRY} ${MYEXT})_${MYLIB}${SPTAG} ${OPTIONS}


Hi, you are missing your n50 and busco results here. Please update them.

I updated the n50, but I never got buscos to actually run to completion without errors unfortunately :( I did get a chance to look at another students results though, so I do know what they should look like if I needed to run it in the future. Thanks.