Skip to content

Commit

Permalink
feat: switch to featureCounts. It is much faster than htseq-count and…
Browse files Browse the repository at this point in the history
… works with alignment sorted BAM files (even though internally it sorts by name to work)

fix: no need for virtual environment in defaults as PBS can pass environment variables to jobs.
  • Loading branch information
meono committed Jul 28, 2016
1 parent dd42a01 commit cd7d2ed
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 9 deletions.
12 changes: 3 additions & 9 deletions iLoop_RNAseq_pipeline/master_qsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,18 +195,12 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
# it turnsout featureCounts does a name based sorting before operating. therefore this might be just as inefficient.
if any(job for job in jobs if job in ['featureCounts', 'edgeR', 'DESeq']):
logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -g {} -o {} {}'.format(defaults['featureCounts_options'],
(os.path.abspath(
os.path.join(project_path,
sample,
'accepted_hits.sorted.bam'))),
jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -o {} {}'.format(defaults['featureCounts_options'],
((ref['gff_genome']) if ref.get(
'gff_genome') else ''),
((ref['fa_genome']) if ref.get(
'fa_genome') else ''),
(os.path.abspath(os.path.join(project_path,
sample,
'htseq_counts.out'))),
'featureCounts_', sample, '.out'))),
(os.path.abspath(
os.path.join(project_path,
sample,
Expand Down Expand Up @@ -410,7 +404,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
return False
# TODO: a threshold for mapping to mask/genome can be used to decide whether to go forward or stop pipeline. At least generate a warning.

mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count']
mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
if (any(job for job in jobs if job in mljobs)) or (jobs == []):
try:
mapjobIDs = []
Expand Down
24 changes: 24 additions & 0 deletions iLoop_RNAseq_pipeline/scripts/featureCounts_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python

import pandas as pd
import json
import os
import argparse
import logging

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(description='This script will collect the counts generated by featureCounts. Outputs 2 files: one for read counts and one for stats.')

parser.add_argument('-p', '--project-path', help='Path to project folder')
parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./htseq/collected_counts')
args = parser.parse_args()

groups = json.load(open(args.groups_json, 'r'))

df = pd.concat([pd.io.parsers.read_table(os.path.join(os.path.abspath(args.project_path), sample, 'htseq_counts.out'), header=None, index_col=0, names=[sample]) for group in groups.values() for sample in group.keys()], axis=1)
df_stats = df.tail(5).copy()
df = df.drop(df.tail(5).index)
df.to_csv('{}.csv'.format(args.output))
df_stats.to_csv('{}_stats.csv'.format(args.output))

0 comments on commit cd7d2ed

Please sign in to comment.