feat: switch to featureCounts. It is much faster than htseq-count and…

… works with alignment sorted BAM files (even though internally it sorts by name to work) fix: no need for virtual environment in defaults as PBS can pass environment variables to jobs.
meono · Jul 28, 2016 · dd42a01 · dd42a01
1 parent b216225
commit dd42a01
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -27,4 +27,4 @@ Optional: An environmental variable "RNAseq_env" for the virtual environment nam
 
 ##Requirements:
 HISAT2 and htseq-count depends on local installations due to version issues. The rest of the packages are used through Computerome module system.
-A python virtual environment to enable running scripts under job. Must be set up to work with virtualenvwrapper.
+A local installation of featureCounts since the module version is old. 
diff --git a/iLoop_RNAseq_pipeline/defaults/RNAseq_pipeline_defaults.txt b/iLoop_RNAseq_pipeline/defaults/RNAseq_pipeline_defaults.txt
@@ -9,4 +9,5 @@ cufflinks_options,--library-type fr-firststrand --max-bundle-frags 25000000
 cuffmerge_options,
 cuffdiff_options,--library-type fr-firststrand --upper-quartile-norm
 cuffquant_options,--library-type fr-firststrand --max-bundle-frags 25000000
-htseq_options,-q --type=gene --idattr=gene_name
+htseq_options,-q --type=gene --idattr=gene_name --stranded=reverse
+featureCounts_options,-s 2 -t gene -g gene_name
diff --git a/iLoop_RNAseq_pipeline/initiate_project.py b/iLoop_RNAseq_pipeline/initiate_project.py
@@ -67,23 +67,6 @@ def get_defaults():
         f.write('\nproject,{}'.format(project))
         f.close()
 
-    while True:
-        # TODO: ideally this should check whether the given virtual environment has the package installed.
-        # try:
-        #     import iLoop_RNAseq_pipeline
-        #     break
-        # except ImportError:
-        #     logger.warning('iLoop_RNAseq_pipeline is not available.')
-        #     pass
-
-        if (('RNAseq_env' not in defaults) or (defaults['RNAseq_env'] == 'python_environment_for_RNAseq')):
-            if os.getenv('RNAseq_env') is not None:
-                defaults['RNAseq_env'] = os.getenv('RNAseq_env')
-                break
-            else:
-                defaults['RNAseq_env'] = input('Enter python environment for RNAseq where iLoop_RNAseq_pipeline is installed.')
-                break
-
     return defaults
 
 

diff --git a/iLoop_RNAseq_pipeline/master_qsub.py b/iLoop_RNAseq_pipeline/master_qsub.py
@@ -90,7 +90,7 @@ def fastqc_job(project_path, groups, output_path, defaults, ppn='8', walltime='0
 
 
 def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', walltime='12:00:00', map_to_mask=False):
-    mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count']
+    mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
     if map_to_mask:
         jobs = ['hisat2_to_mask']
     elif (jobs == []) and (map_to_mask == False):
@@ -111,10 +111,10 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
 export PATH''']
 
     jobstr += ['export hisat2_genomic_indexes={}'.format(ref['hisat2_genomic_indexes'])]
-    if map_to_mask:
-        jobstr += ['mkdir -p {}/{}/{}'.format(project_path, 'map_to_mask', sample)]
-    else:
-        jobstr += ['mkdir {}/{}'.format(project_path, sample)]
+    if (map_to_mask) and (not os.path.exists(os.path.join(project_path, 'map_to_mask', sample))):
+        jobstr += ['mkdir -p {}'.format(os.path.abspath(os.path.join(project_path, 'map_to_mask', sample)))]
+    elif not os.path.exists(os.path.join(project_path, sample)):
+        jobstr += ['mkdir {}'.format(os.path.abspath(os.path.join(project_path, sample)))]
 
     R1reads = [read for read in reads if read[-15:-13] == 'R1']
     R1reads.sort()
@@ -176,20 +176,41 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
                                                                                      os.path.join(project_path, sample,
                                                                                                   'accepted_hits.sorted.bam'))))]
 
-    if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq']):
+    # htseq-count doesn't work well with positional sorted bam files. Switch to featureCounts from subread package
+    # if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq']):
+    #     logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
+    #     jobstr += ['echo "htseq"\nhtseq-count {} -f bam -r pos {} {} -o {} > {}'.format(defaults['htseq_options'],
+    #                                                                              (os.path.abspath(
+    #                                                                                  os.path.join(project_path, sample,
+    #                                                                                               'accepted_hits.sorted.bam'))),
+    #                                                                              ((ref['gff_genome']) if ref.get(
+    #                                                                                  'gff_genome') else ''),
+    #                                                                              (os.path.abspath(
+    #                                                                                  os.path.join(project_path, sample,
+    #                                                                                               'htseq_counts.sam'))),
+    #                                                                              (os.path.abspath(
+    #                                                                                  os.path.join(project_path, sample,
+    #                                                                                               'htseq_counts.out'))))]
+
+    # it turnsout featureCounts does a name based sorting before operating. therefore this might be just as inefficient.
+    if any(job for job in jobs if job in ['featureCounts', 'edgeR', 'DESeq']):
         logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
-        jobstr += ['echo "htseq"\nhtseq-count {} -f bam -r pos {} {} -o {} > {}'.format(defaults['htseq_options'],
-                                                                                 (os.path.abspath(
-                                                                                     os.path.join(project_path, sample,
-                                                                                                  'accepted_hits.sorted.bam'))),
-                                                                                 ((ref['gff_genome']) if ref.get(
-                                                                                     'gff_genome') else ''),
-                                                                                 (os.path.abspath(
-                                                                                     os.path.join(project_path, sample,
-                                                                                                  'htseq_counts.sam'))),
-                                                                                 (os.path.abspath(
-                                                                                     os.path.join(project_path, sample,
-                                                                                                  'htseq_counts.out'))))]
+        jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -g {} -o {} {}'.format(defaults['featureCounts_options'],
+                                                                                        (os.path.abspath(
+                                                                                            os.path.join(project_path,
+                                                                                                         sample,
+                                                                                                         'accepted_hits.sorted.bam'))),
+                                                                                        ((ref['gff_genome']) if ref.get(
+                                                                                            'gff_genome') else ''),
+                                                                                        ((ref['fa_genome']) if ref.get(
+                                                                                            'fa_genome') else ''),
+                                                                                        (os.path.abspath(os.path.join(project_path,
+                                                                                                                      sample,
+                                                                                                                      'htseq_counts.out'))),
+                                                                                        (os.path.abspath(
+                                                                                            os.path.join(project_path,
+                                                                                                         sample,
+                                                                                                         'htseq_counts.sam'))))]
 
     return '\n\n'.join(jobstr).replace('PPN', ppn)
 
@@ -208,8 +229,15 @@ def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', wallt
     # TODO: clear out virtual environment arguments if this works
     jobstr += ['#PBS -V']
 
+    # this is for htseq-count, which won't be used. For now, featureCounts will be used.
+    # jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(os.path.abspath(os.path.join(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
+    #                                                                           os.path.abspath(project_path),
+    #                                                                           os.path.abspath(os.path.join(os.path.join(project_path, 'groups.json'))),
+    #                                                                           output)]
+
+    # line for featureCounts
     jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(os.path.abspath(os.path.join(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
-                                                                              project_path,
+                                                                              os.path.abspath(project_path),
                                                                               os.path.abspath(os.path.join(os.path.join(project_path, 'groups.json'))),
                                                                               output)]
 
@@ -407,7 +435,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
     # collect htseq counts
     if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq', 'htseq-count-collect']) or (jobs == []):
         try:
-            js = collect_counts_job(project_path=project_path, output=results_path, mapjobIDs=mapjobIDs, defaults=defaults)
+            js = collect_counts_job(project_path=project_path, output=os.path.abspath(os.path.join(results_path, 'htseq_counts_collected')), mapjobIDs=mapjobIDs, defaults=defaults)
             collectjobID = job_submitter(js=js, path=job_files_path, name='job_htseq_count_collector.sh')
         except Exception as ex:
             logger.error(