diff --git a/EDTA.pl b/EDTA.pl index 8bad5ae..a42b384 100755 --- a/EDTA.pl +++ b/EDTA.pl @@ -6,7 +6,7 @@ use Getopt::Long; use Pod::Usage; -my $version = "v2.0.0"; +my $version = "v2.0.1"; #v1.0 05/31/2019 #v1.1 06/05/2019 #v1.2 06/16/2019 @@ -33,27 +33,27 @@ structure-based TE library. Usage: perl EDTA.pl [options] - --genome [File] The genome FASTA + --genome [File] The genome FASTA file. Required. --species [Rice|Maize|others] Specify the species for identification of TIR candidates. Default: others - --step [all|filter|final|anno] Specify which steps you want to run EDTA. + --step [all|filter|final|anno] Specify which steps you want to run EDTA. all: run the entire pipeline (default) filter: start from raw TEs to the end. final: start from filtered TEs to finalizing the run. anno: perform whole-genome annotation/analysis after TE library construction. - --overwrite [0|1] If previous raw TE results are found, decide to overwrite + --overwrite [0|1] If previous raw TE results are found, decide to overwrite (1, rerun) or not (0, default). - --cds [File] Provide a FASTA file containing the coding sequence (no introns, + --cds [File] Provide a FASTA file containing the coding sequence (no introns, UTRs, nor TEs) of this genome or its close relative. - --curatedlib [File] Provided a curated library to keep consistant naming and + --curatedlib [File] Provided a curated library to keep consistant naming and classification for known TEs. TEs in this file will be trusted 100%, so please ONLY provide MANUALLY CURATED ones. This option is not mandatory. It's totally OK if no file is provided (default). - --sensitive [0|1] Use RepeatModeler to identify remaining TEs (1) or not (0, + --sensitive [0|1] Use RepeatModeler to identify remaining TEs (1) or not (0, default). This step is slow but MAY help to recover some TEs. - --anno [0|1] Perform (1) or not perform (0, default) whole-genome TE annotation + --anno [0|1] Perform (1) or not perform (0, default) whole-genome TE annotation after TE library construction. --rmout [File] Provide your own homology-based TE annotation instead of using the EDTA library for masking. File is in RepeatMasker .out format. This @@ -62,16 +62,19 @@ --evaluate [0|1] Evaluate (1) classification consistency of the TE annotation. (--anno 1 required). Default: 0. This step is slow and does not change the annotation result. - --exclude [File] Exclude bed format regions from TE annotation. Default: undef. + --exclude [File] Exclude bed format regions from TE annotation. Default: undef. (--anno 1 required). --force [0|1] When no confident TE candidates are found: 0, interrupt and exit (default); 1, use rice TEs to continue. + --u [float] Neutral mutation rate to calculate the age of intact LTR elements. + Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list. + Default: 1.3e-8 (per bp per year, from rice). --repeatmodeler [path] The directory containing RepeatModeler (default: read from ENV) --repeatmasker [path] The directory containing RepeatMasker (default: read from ENV) --check_dependencies Check if dependencies are fullfiled and quit - --threads|-t [int] Number of theads to run this script (default: 4) - --debug [0|1] Retain intermediate files (default: 0) - --help|-h Display this help info + --threads|-t [int] Number of theads to run this script (default: 4) + --debug [0|1] Retain intermediate files (default: 0) + --help|-h Display this help info \n"; # pre-defined @@ -88,6 +91,7 @@ my $evaluate = 0; #1 will evaluate the consistancy of the TE annotation my $exclude = ''; #a bed file exclude from TE annotation my $force = 0; #if there is no confident TE found in EDTA_raw, 1 will use rice TEs as raw lib, 0 will error and interrupt. +my $miu = 1.3e-8; #mutation rate, per bp per year, from rice my $threads = 4; my $script_path = $FindBin::Bin; my $EDTA_raw = "$script_path/EDTA_raw.pl"; @@ -150,6 +154,7 @@ 'evaluate=i' => \$evaluate, 'exclude=s' => \$exclude, 'force=i' => \$force, + 'u=s' => \$miu, 'repeatmodeler=s' => \$repeatmodeler, 'repeatmasker=s' => \$repeatmasker, 'tesorter=s' => \$TEsorter, @@ -185,6 +190,7 @@ if ($anno != 0 and $anno != 1){ die "The expected value for the anno parameter is 0 or 1!\n"} if ($evaluate != 0 and $evaluate != 1){ die "The expected value for the evaluate parameter is 0 or 1!\n"} if ($force != 0 and $force != 1){ die "The expected value for the force parameter is 0 or 1!\n"} +if ($miu !~ /[0-9\.e\-]+/){ die "The expected value for the u parameter is float value without units!\n"} if ($debug != 0 and $debug != 1){ die "The expected value for the debug parameter is 0 or 1!\n"} if ($threads !~ /^[0-9]+$/){ die "The expected value for the threads parameter is an integer!\n"} @@ -394,7 +400,7 @@ print "$date\tObtain raw TE libraries using various structure-based programs: \n"; # Get raw TE candidates -`perl $EDTA_raw --genome $genome --overwrite $overwrite --species $species --threads $threads --genometools $genometools --ltrretriever $LTR_retriever --blastplus $blastplus --tesorter $TEsorter --GRF $GRF --trf_path $trf --repeatmasker $repeatmasker --convert_seq_name 0`; +`perl $EDTA_raw --genome $genome --overwrite $overwrite --species $species --u $miu --threads $threads --genometools $genometools --ltrretriever $LTR_retriever --blastplus $blastplus --tesorter $TEsorter --GRF $GRF --trf_path $trf --repeatmasker $repeatmasker --convert_seq_name 0`; chdir "$genome.EDTA.raw"; @@ -533,7 +539,7 @@ if (-s "$cds"){ my $rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.raw.fa 2>/dev/null`; `cp $genome.EDTA.raw.fa $genome.EDTA.raw.fa.masked` if $rm_status =~ /No repetitive sequences were detected/i; - my $rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.intact.fa.raw 2>/dev/null`; + $rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.intact.fa.raw 2>/dev/null`; `cp $genome.EDTA.intact.fa.raw $genome.EDTA.intact.fa.raw.masked` if $rm_status =~ /No repetitive sequences were detected/i; `perl $cleanup_tandem -misschar N -Nscreen 1 -nc 1000 -nr 0.3 -minlen 80 -maxlen 5000000 -trf 0 -cleanN 1 -cleanT 1 -f $genome.EDTA.raw.fa.masked > $genome.EDTA.raw.fa.cln`; `perl $cleanup_tandem -misschar N -Nscreen 1 -nc 1000 -nr 0.8 -minlen 80 -maxlen 5000000 -trf 0 -cleanN 0 -f $genome.EDTA.intact.fa.raw.masked > $genome.EDTA.intact.fa.rmCDS`; diff --git a/EDTA_raw.pl b/EDTA_raw.pl index 8d42d6c..e50e918 100755 --- a/EDTA_raw.pl +++ b/EDTA_raw.pl @@ -31,6 +31,9 @@ --convert_seq_name [0|1] Convert long sequence name to <= 15 characters and remove annotations (1, default) or use the original (0) + --u [float] Neutral mutation rate to calculate the age of intact LTR elements. + Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list. + Default: 1.3e-8 (per bp per year, from rice). --tesorter [path] Path to the TEsorter program. (default: find from ENV) --repeatmasker [path] Path to the RepeatMasker program. (default: find from ENV) --threads|-t [int] Number of theads to run this script. Default: 4 @@ -44,6 +47,7 @@ my $overwrite = 0; #0, no rerun. 1, rerun even old results exist. my $convert_name = 1; #0, use original seq names; 1 shorten names. my $maxint = 5000; #maximum interval length (bp) between TIRs (for GRF in TIR-Learner) +my $miu = 1.3e-8; #mutation rate, per bp per year, from rice my $threads = 4; my $script_path = $FindBin::Bin; my $LTR_FINDER = "$script_path/bin/LTR_FINDER_parallel/LTR_FINDER_parallel"; @@ -82,6 +86,7 @@ $type = lc $ARGV[$k+1] if /^--type$/i and $ARGV[$k+1] !~ /^-/; $overwrite = $ARGV[$k+1] if /^--overwrite$/i and $ARGV[$k+1] !~ /^-/; $convert_name = $ARGV[$k+1] if /^--convert_seq_name$/i and $ARGV[$k+1] !~ /^-/; + $miu = $ARGV[$k+1] if /^--u$/i and $ARGV[$k+1] !~ /^-/; $genometools = $ARGV[$k+1] if /^--genometools/i and $ARGV[$k+1] !~ /^-/; $repeatmasker = $ARGV[$k+1] if /^--repeatmasker$/i and $ARGV[$k+1] !~ /^-/; $LTR_retriever = $ARGV[$k+1] if /^--ltrretriever/i and $ARGV[$k+1] !~ /^-/; @@ -124,6 +129,7 @@ if ($overwrite != 0 and $overwrite != 1){ die "The expected value for the overwrite parameter is 0 or 1!\n"}; if ($convert_name != 0 and $convert_name != 1){ die "The expected value for the convert_seq_name parameter is 0 or 1!\n"}; if ($threads !~ /^[0-9]+$/){ die "The expected value for the threads parameter is an integer!\n"}; +if ($miu !~ /[0-9\.e\-]+/){ die "The expected value for the u parameter is float value without units!\n"} chomp (my $date = `date`); print STDERR "$date\tEDTA_raw: Check dependencies, prepare working directories.\n\n"; @@ -297,7 +303,7 @@ # run LTR_retriever `cat $genome.harvest.combine.scn $genome.finder.combine.scn > $genome.rawLTR.scn`; -`${LTR_retriever}LTR_retriever -genome $genome -inharvest $genome.rawLTR.scn -threads $threads -noanno -trf_path $trf -blastplus $blastplus -repeatmasker $repeatmasker`; +`${LTR_retriever}LTR_retriever -genome $genome -inharvest $genome.rawLTR.scn -u $miu -threads $threads -noanno -trf_path $trf -blastplus $blastplus -repeatmasker $repeatmasker`; # get full-length LTR from pass.list `awk '{if (\$1 !~ /#/) print \$1"\\t"\$1}' $genome.pass.list | perl $call_seq - -C $genome > $genome.LTR.intact.fa.ori`; diff --git a/README.md b/README.md index 0dccab3..278485b 100644 --- a/README.md +++ b/README.md @@ -176,25 +176,29 @@ Optional: *You got a genome and you want to get a high-quality TE annotation:* perl EDTA.pl [options] - --genome [File] The genome FASTA + --genome [File] The genome FASTA file. Required. --species [Rice|Maize|others] Specify the species for identification of TIR candidates. Default: others - --step [all|filter|final|anno] Specify which steps you want to run EDTA. - all: run the entire pipeline (default) - filter: start from raw TEs to the end. - final: start from filtered TEs to finalizing the run. - anno: perform whole-genome annotation/analysis after TE library construction. - --overwrite [0|1] If previous results are found, decide to overwrite (1, rerun) or not (0, default). - --cds [File] Provide a FASTA file containing the coding sequence (no introns, UTRs, nor TEs) of this genome or its close relative. - --curatedlib [file] Provided a curated library to keep consistant naming and classification for known TEs. + --step [all|filter|final|anno] Specify which steps you want to run EDTA. + all: run the entire pipeline (default) + filter: start from raw TEs to the end. + final: start from filtered TEs to finalizing the run. + anno: perform whole-genome annotation/analysis after TE library construction. + --overwrite [0|1] If previous results are found, decide to overwrite (1, rerun) or not (0, default). + --cds [File] Provide a FASTA file containing the coding sequence (no introns, UTRs, nor TEs) of this genome or its close relative. + --curatedlib [file] Provided a curated library to keep consistant naming and classification for known TEs. All TEs in this file will be trusted 100%, so please ONLY provide MANUALLY CURATED ones here. - This option is not mandatory. It's totally OK if no file is provided (default). - --sensitive [0|1] Use RepeatModeler to identify remaining TEs (1) or not (0, default). - This step is very slow and MAY help to recover some TEs. - --anno [0|1] Perform (1) or not perform (0, default) whole-genome TE annotation after TE library construction. - --rmout [File] Provide your own homology-based TE annotation instead of using the EDTA library for masking. File is in RepeatMasker .out format. This file will be merged with the structural-based TE annotation. (--anno 1 required). Default: use the EDTA library for annotation. - --evaluate [0|1] Evaluate (1) classification consistency of the TE annotation. (--anno 1 required). Default: 0. - This step is slow and does not affect the annotation result. + This option is not mandatory. It's totally OK if no file is provided (default). + --sensitive [0|1] Use RepeatModeler to identify remaining TEs (1) or not (0, default). + This step is very slow and MAY help to recover some TEs. + --anno [0|1] Perform (1) or not perform (0, default) whole-genome TE annotation after TE library construction. + --rmout [File] Provide your own homology-based TE annotation instead of using the EDTA library for masking. + File is in RepeatMasker .out format. This file will be merged with the structural-based TE annotation. (--anno 1 required). + Default: use the EDTA library for annotation. + --evaluate [0|1] Evaluate (1) classification consistency of the TE annotation. (--anno 1 required). Default: 0. + This step is slow and does not affect the annotation result. --exclude [File] Exclude bed format regions from TE annotation. Default: undef. (--anno 1 required). + --u [float] Neutral mutation rate to calculate the age of intact LTR elements. + Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list. Default: 1.3e-8 (per bp per year, from rice). --threads|-t [int] Number of theads to run this script (default: 4) --help|-h Display this help info