add the -u parameter to allow user specified mutation rate #271

oushujun · Jun 23, 2022 · 001834d · 001834d
1 parent c86d6db
commit 001834d
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 31 deletions.
diff --git a/EDTA.pl b/EDTA.pl
@@ -6,7 +6,7 @@
 use Getopt::Long;
 use Pod::Usage;
 
-my $version = "v2.0.0";
+my $version = "v2.0.1";
 #v1.0 05/31/2019
 #v1.1 06/05/2019
 #v1.2 06/16/2019
@@ -33,27 +33,27 @@
 structure-based TE library. Usage:
 
 perl EDTA.pl [options]
-	--genome	[File]	The genome FASTA
+	--genome [File]		The genome FASTA file. Required.
 	--species [Rice|Maize|others]	Specify the species for identification of TIR
 					candidates. Default: others
-	--step	[all|filter|final|anno] Specify which steps you want to run EDTA.
+	--step [all|filter|final|anno]	Specify which steps you want to run EDTA.
 					all: run the entire pipeline (default)
 					filter: start from raw TEs to the end.
 					final: start from filtered TEs to finalizing the run.
 					anno: perform whole-genome annotation/analysis after
 						TE library construction.
-	--overwrite	[0|1]	If previous raw TE results are found, decide to overwrite
+	--overwrite [0|1]	If previous raw TE results are found, decide to overwrite
 				(1, rerun) or not (0, default).
-	--cds	[File]	Provide a FASTA file containing the coding sequence (no introns,
+	--cds [File]	Provide a FASTA file containing the coding sequence (no introns,
 			UTRs, nor TEs) of this genome or its close relative.
-	--curatedlib	[File]	Provided a curated library to keep consistant naming and
+	--curatedlib [File]	Provided a curated library to keep consistant naming and
 				classification for known TEs. TEs in this file will be
 				trusted 100%, so please ONLY provide MANUALLY CURATED ones.
 				This option is not mandatory. It's totally OK if no file is
 				provided (default).
-	--sensitive	[0|1]	Use RepeatModeler to identify remaining TEs (1) or not (0,
+	--sensitive [0|1]	Use RepeatModeler to identify remaining TEs (1) or not (0,
 				default). This step is slow but MAY help to recover some TEs.
-	--anno	[0|1]	Perform (1) or not perform (0, default) whole-genome TE annotation
+	--anno [0|1]	Perform (1) or not perform (0, default) whole-genome TE annotation
 			after TE library construction.
 	--rmout	[File]	Provide your own homology-based TE annotation instead of using the
 			EDTA library for masking. File is in RepeatMasker .out format. This
@@ -62,16 +62,19 @@
 	--evaluate [0|1]	Evaluate (1) classification consistency of the TE annotation.
 				(--anno 1 required). Default: 0. This step is slow and does
 				not change the annotation result.
-	--exclude	[File]	Exclude bed format regions from TE annotation. Default: undef.
+	--exclude [File]	Exclude bed format regions from TE annotation. Default: undef.
 				(--anno 1 required).
 	--force	[0|1]	When no confident TE candidates are found: 0, interrupt and exit
 			(default); 1, use rice TEs to continue.
+	--u [float]	Neutral mutation rate to calculate the age of intact LTR elements.
+			Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list.
+			Default: 1.3e-8 (per bp per year, from rice).
 	--repeatmodeler [path]	The directory containing RepeatModeler (default: read from ENV)
 	--repeatmasker [path]	The directory containing RepeatMasker (default: read from ENV)
 	--check_dependencies Check if dependencies are fullfiled and quit
-	--threads|-t	[int]	Number of theads to run this script (default: 4)
-	--debug		[0|1]	Retain intermediate files (default: 0)
-	--help|-h	Display this help info
+	--threads|-t [int]	Number of theads to run this script (default: 4)
+	--debug	 [0|1]	Retain intermediate files (default: 0)
+	--help|-h 	Display this help info
 \n";
 
 # pre-defined
@@ -88,6 +91,7 @@
 my $evaluate = 0; #1 will evaluate the consistancy of the TE annotation
 my $exclude = ''; #a bed file exclude from TE annotation
 my $force = 0; #if there is no confident TE found in EDTA_raw, 1 will use rice TEs as raw lib, 0 will error and interrupt.
+my $miu = 1.3e-8; #mutation rate, per bp per year, from rice
 my $threads = 4;
 my $script_path = $FindBin::Bin;
 my $EDTA_raw = "$script_path/EDTA_raw.pl";
@@ -150,6 +154,7 @@
 		  'evaluate=i'           => \$evaluate,
 		  'exclude=s'            => \$exclude,
 		  'force=i'              => \$force,
+		  'u=s'                  => \$miu,
 		  'repeatmodeler=s'      => \$repeatmodeler,
 		  'repeatmasker=s'       => \$repeatmasker,
 		  'tesorter=s'           => \$TEsorter,
@@ -185,6 +190,7 @@
 if ($anno != 0 and $anno != 1){ die "The expected value for the anno parameter is 0 or 1!\n"}
 if ($evaluate != 0 and $evaluate != 1){ die "The expected value for the evaluate parameter is 0 or 1!\n"}
 if ($force != 0 and $force != 1){ die "The expected value for the force parameter is 0 or 1!\n"}
+if ($miu !~ /[0-9\.e\-]+/){ die "The expected value for the u parameter is float value without units!\n"}
 if ($debug != 0 and $debug != 1){ die "The expected value for the debug parameter is 0 or 1!\n"}
 if ($threads !~ /^[0-9]+$/){ die "The expected value for the threads parameter is an integer!\n"}
 
@@ -394,7 +400,7 @@
 print "$date\tObtain raw TE libraries using various structure-based programs: \n";
 
 # Get raw TE candidates
-`perl $EDTA_raw --genome $genome --overwrite $overwrite --species $species --threads $threads --genometools $genometools --ltrretriever $LTR_retriever --blastplus $blastplus --tesorter $TEsorter --GRF $GRF --trf_path $trf --repeatmasker $repeatmasker --convert_seq_name 0`;
+`perl $EDTA_raw --genome $genome --overwrite $overwrite --species $species --u $miu --threads $threads --genometools $genometools --ltrretriever $LTR_retriever --blastplus $blastplus --tesorter $TEsorter --GRF $GRF --trf_path $trf --repeatmasker $repeatmasker --convert_seq_name 0`;
 
 chdir "$genome.EDTA.raw";
 
@@ -533,7 +539,7 @@
 	if (-s "$cds"){
 		my $rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.raw.fa 2>/dev/null`;
 		`cp $genome.EDTA.raw.fa $genome.EDTA.raw.fa.masked` if $rm_status =~ /No repetitive sequences were detected/i;
-		my $rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.intact.fa.raw 2>/dev/null`;
+		$rm_status = `${repeatmasker}RepeatMasker -e ncbi -pa $threads -q -no_is -norna -nolow -div 40 -cutoff 225 -lib $cds $genome.EDTA.intact.fa.raw 2>/dev/null`;
 		`cp $genome.EDTA.intact.fa.raw $genome.EDTA.intact.fa.raw.masked` if $rm_status =~ /No repetitive sequences were detected/i;
 		`perl $cleanup_tandem -misschar N -Nscreen 1 -nc 1000 -nr 0.3 -minlen 80 -maxlen 5000000 -trf 0 -cleanN 1 -cleanT 1 -f $genome.EDTA.raw.fa.masked > $genome.EDTA.raw.fa.cln`;
 		`perl $cleanup_tandem -misschar N -Nscreen 1 -nc 1000 -nr 0.8 -minlen 80 -maxlen 5000000 -trf 0 -cleanN 0 -f $genome.EDTA.intact.fa.raw.masked > $genome.EDTA.intact.fa.rmCDS`;

diff --git a/EDTA_raw.pl b/EDTA_raw.pl
@@ -31,6 +31,9 @@
 	--convert_seq_name	[0|1]	Convert long sequence name to <= 15
 					characters and remove annotations (1,
 					default) or use the original (0)
+	--u [float]	Neutral mutation rate to calculate the age of intact LTR elements.
+			Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list.
+			Default: 1.3e-8 (per bp per year, from rice).
 	--tesorter	[path]	Path to the TEsorter program. (default: find from ENV)
 	--repeatmasker	[path]	Path to the RepeatMasker program. (default: find from ENV)
 	--threads|-t	[int]	Number of theads to run this script. Default: 4
@@ -44,6 +47,7 @@
 my $overwrite = 0; #0, no rerun. 1, rerun even old results exist.
 my $convert_name = 1; #0, use original seq names; 1 shorten names.
 my $maxint = 5000; #maximum interval length (bp) between TIRs (for GRF in TIR-Learner)
+my $miu = 1.3e-8; #mutation rate, per bp per year, from rice
 my $threads = 4;
 my $script_path = $FindBin::Bin;
 my $LTR_FINDER = "$script_path/bin/LTR_FINDER_parallel/LTR_FINDER_parallel";
@@ -82,6 +86,7 @@
 	$type = lc $ARGV[$k+1] if /^--type$/i and $ARGV[$k+1] !~ /^-/;
 	$overwrite = $ARGV[$k+1] if /^--overwrite$/i and $ARGV[$k+1] !~ /^-/;
 	$convert_name = $ARGV[$k+1] if /^--convert_seq_name$/i and $ARGV[$k+1] !~ /^-/;
+	$miu = $ARGV[$k+1] if /^--u$/i and $ARGV[$k+1] !~ /^-/;
 	$genometools = $ARGV[$k+1] if /^--genometools/i and $ARGV[$k+1] !~ /^-/;
 	$repeatmasker = $ARGV[$k+1] if /^--repeatmasker$/i and $ARGV[$k+1] !~ /^-/;
 	$LTR_retriever = $ARGV[$k+1] if /^--ltrretriever/i and $ARGV[$k+1] !~ /^-/;
@@ -124,6 +129,7 @@
 if ($overwrite != 0 and $overwrite != 1){ die "The expected value for the overwrite parameter is 0 or 1!\n"};
 if ($convert_name != 0 and $convert_name != 1){ die "The expected value for the convert_seq_name parameter is 0 or 1!\n"};
 if ($threads !~ /^[0-9]+$/){ die "The expected value for the threads parameter is an integer!\n"};
+if ($miu !~ /[0-9\.e\-]+/){ die "The expected value for the u parameter is float value without units!\n"}
 
 chomp (my $date = `date`);
 print STDERR "$date\tEDTA_raw: Check dependencies, prepare working directories.\n\n";
@@ -297,7 +303,7 @@
 
 # run LTR_retriever
 `cat $genome.harvest.combine.scn $genome.finder.combine.scn > $genome.rawLTR.scn`;
-`${LTR_retriever}LTR_retriever -genome $genome -inharvest $genome.rawLTR.scn -threads $threads -noanno -trf_path $trf -blastplus $blastplus -repeatmasker $repeatmasker`;
+`${LTR_retriever}LTR_retriever -genome $genome -inharvest $genome.rawLTR.scn -u $miu -threads $threads -noanno -trf_path $trf -blastplus $blastplus -repeatmasker $repeatmasker`;
 
 # get full-length LTR from pass.list
 `awk '{if (\$1 !~ /#/) print \$1"\\t"\$1}' $genome.pass.list | perl $call_seq - -C $genome > $genome.LTR.intact.fa.ori`;

diff --git a/README.md b/README.md
@@ -176,25 +176,29 @@ Optional:
 *You got a genome and you want to get a high-quality TE annotation:*
 
     perl EDTA.pl [options]
-      --genome	[File]	The genome FASTA
+      --genome [File]		The genome FASTA file. Required.
       --species [Rice|Maize|others]	Specify the species for identification of TIR candidates. Default: others
-      --step	[all|filter|final|anno] Specify which steps you want to run EDTA.
-				all: run the entire pipeline (default)
-				filter: start from raw TEs to the end.
-				final: start from filtered TEs to finalizing the run.
-				anno: perform whole-genome annotation/analysis after TE library construction.
-      --overwrite	[0|1]	If previous results are found, decide to overwrite (1, rerun) or not (0, default).
-      --cds	[File]	Provide a FASTA file containing the coding sequence (no introns, UTRs, nor TEs) of this genome or its close relative.
-      --curatedlib	[file]	Provided a curated library to keep consistant naming and classification for known TEs.
+      --step [all|filter|final|anno]	Specify which steps you want to run EDTA.
+					 all: run the entire pipeline (default)
+					 filter: start from raw TEs to the end.
+					 final: start from filtered TEs to finalizing the run.
+					 anno: perform whole-genome annotation/analysis after TE library construction.
+      --overwrite [0|1]		If previous results are found, decide to overwrite (1, rerun) or not (0, default).
+      --cds [File]		Provide a FASTA file containing the coding sequence (no introns, UTRs, nor TEs) of this genome or its close relative.
+      --curatedlib [file]	Provided a curated library to keep consistant naming and classification for known TEs.
 				All TEs in this file will be trusted 100%, so please ONLY provide MANUALLY CURATED ones here.
-				This option is not mandatory. It's totally OK if no file is provided (default).
-      --sensitive	[0|1]	Use RepeatModeler to identify remaining TEs (1) or not (0, default).
-				This step is very slow and MAY help to recover some TEs.
-      --anno	[0|1]	Perform (1) or not perform (0, default) whole-genome TE annotation after TE library construction.
-      --rmout	[File]	Provide your own homology-based TE annotation instead of using the EDTA library for masking. File is in RepeatMasker .out format. This file will be merged with the structural-based TE annotation. (--anno 1 required). Default: use the EDTA library for annotation.
-      --evaluate	[0|1]	Evaluate (1) classification consistency of the TE annotation. (--anno 1 required). Default: 0.
-				This step is slow and does not affect the annotation result.
+				 This option is not mandatory. It's totally OK if no file is provided (default).
+      --sensitive [0|1]		Use RepeatModeler to identify remaining TEs (1) or not (0, default).
+				 This step is very slow and MAY help to recover some TEs.
+      --anno [0|1]	Perform (1) or not perform (0, default) whole-genome TE annotation after TE library construction.
+      --rmout [File]	Provide your own homology-based TE annotation instead of using the EDTA library for masking.
+			File is in RepeatMasker .out format. This file will be merged with the structural-based TE annotation. (--anno 1 required).
+			Default: use the EDTA library for annotation.
+      --evaluate [0|1]	Evaluate (1) classification consistency of the TE annotation. (--anno 1 required). Default: 0.
+			 This step is slow and does not affect the annotation result.
       --exclude	[File]	Exclude bed format regions from TE annotation. Default: undef. (--anno 1 required).
+      --u [float]	Neutral mutation rate to calculate the age of intact LTR elements.
+			 Intact LTR age is found in this file: *EDTA_raw/LTR/*.pass.list. Default: 1.3e-8 (per bp per year, from rice).
       --threads|-t	[int]	Number of theads to run this script (default: 4)
       --help|-h	Display this help info