feat: switch to featureCounts. It is much faster than htseq-count and…

… works with alignment sorted BAM files (even though internally it sorts by name to work) fix: no need for virtual environment in defaults as PBS can pass environment variables to jobs.
meono · Jul 28, 2016 · cd7d2ed · cd7d2ed
1 parent dd42a01
commit cd7d2ed
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 9 deletions.
diff --git a/iLoop_RNAseq_pipeline/master_qsub.py b/iLoop_RNAseq_pipeline/master_qsub.py
@@ -195,18 +195,12 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
     # it turnsout featureCounts does a name based sorting before operating. therefore this might be just as inefficient.
     if any(job for job in jobs if job in ['featureCounts', 'edgeR', 'DESeq']):
         logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
-        jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -g {} -o {} {}'.format(defaults['featureCounts_options'],
-                                                                                        (os.path.abspath(
-                                                                                            os.path.join(project_path,
-                                                                                                         sample,
-                                                                                                         'accepted_hits.sorted.bam'))),
+        jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -o {} {}'.format(defaults['featureCounts_options'],
                                                                                         ((ref['gff_genome']) if ref.get(
                                                                                             'gff_genome') else ''),
-                                                                                        ((ref['fa_genome']) if ref.get(
-                                                                                            'fa_genome') else ''),
                                                                                         (os.path.abspath(os.path.join(project_path,
                                                                                                                       sample,
-                                                                                                                      'htseq_counts.out'))),
+                                                                                                                      'featureCounts_', sample, '.out'))),
                                                                                         (os.path.abspath(
                                                                                             os.path.join(project_path,
                                                                                                          sample,
@@ -410,7 +404,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
             return False
     # TODO: a threshold for mapping to mask/genome can be used to decide whether to go forward or stop pipeline. At least generate a warning.
 
-    mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count']
+    mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
     if (any(job for job in jobs if job in mljobs)) or (jobs == []):
         try:
             mapjobIDs = []

diff --git a/iLoop_RNAseq_pipeline/scripts/featureCounts_collector.py b/iLoop_RNAseq_pipeline/scripts/featureCounts_collector.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import json
+import os
+import argparse
+import logging
+
+logger = logging.getLogger(__name__)
+
+parser = argparse.ArgumentParser(description='This script will collect the counts generated by featureCounts. Outputs 2 files: one for read counts and one for stats.')
+
+parser.add_argument('-p', '--project-path', help='Path to project folder')
+parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
+parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./htseq/collected_counts')
+args = parser.parse_args()
+
+groups = json.load(open(args.groups_json, 'r'))
+
+df = pd.concat([pd.io.parsers.read_table(os.path.join(os.path.abspath(args.project_path), sample, 'htseq_counts.out'), header=None, index_col=0, names=[sample]) for group in groups.values() for sample in group.keys()], axis=1)
+df_stats = df.tail(5).copy()
+df = df.drop(df.tail(5).index)
+df.to_csv('{}.csv'.format(args.output))
+df_stats.to_csv('{}_stats.csv'.format(args.output))