Skip to content

Commit

Permalink
feat: switch to featureCounts. It is much faster than htseq-count and…
Browse files Browse the repository at this point in the history
… works with alignment sorted BAM files (even though internally it sorts by name to work)

fix: no need for virtual environment in defaults as PBS can pass environment variables to jobs.
  • Loading branch information
meono committed Jul 28, 2016
1 parent b216225 commit dd42a01
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 39 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ Optional: An environmental variable "RNAseq_env" for the virtual environment nam

##Requirements:
HISAT2 and htseq-count depends on local installations due to version issues. The rest of the packages are used through Computerome module system.
A python virtual environment to enable running scripts under job. Must be set up to work with virtualenvwrapper.
A local installation of featureCounts since the module version is old.
3 changes: 2 additions & 1 deletion iLoop_RNAseq_pipeline/defaults/RNAseq_pipeline_defaults.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ cufflinks_options,--library-type fr-firststrand --max-bundle-frags 25000000
cuffmerge_options,
cuffdiff_options,--library-type fr-firststrand --upper-quartile-norm
cuffquant_options,--library-type fr-firststrand --max-bundle-frags 25000000
htseq_options,-q --type=gene --idattr=gene_name
htseq_options,-q --type=gene --idattr=gene_name --stranded=reverse
featureCounts_options,-s 2 -t gene -g gene_name
17 changes: 0 additions & 17 deletions iLoop_RNAseq_pipeline/initiate_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,6 @@ def get_defaults():
f.write('\nproject,{}'.format(project))
f.close()

while True:
# TODO: ideally this should check whether the given virtual environment has the package installed.
# try:
# import iLoop_RNAseq_pipeline
# break
# except ImportError:
# logger.warning('iLoop_RNAseq_pipeline is not available.')
# pass

if (('RNAseq_env' not in defaults) or (defaults['RNAseq_env'] == 'python_environment_for_RNAseq')):
if os.getenv('RNAseq_env') is not None:
defaults['RNAseq_env'] = os.getenv('RNAseq_env')
break
else:
defaults['RNAseq_env'] = input('Enter python environment for RNAseq where iLoop_RNAseq_pipeline is installed.')
break

return defaults


Expand Down
68 changes: 48 additions & 20 deletions iLoop_RNAseq_pipeline/master_qsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def fastqc_job(project_path, groups, output_path, defaults, ppn='8', walltime='0


def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', walltime='12:00:00', map_to_mask=False):
mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count']
mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
if map_to_mask:
jobs = ['hisat2_to_mask']
elif (jobs == []) and (map_to_mask == False):
Expand All @@ -111,10 +111,10 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
export PATH''']

jobstr += ['export hisat2_genomic_indexes={}'.format(ref['hisat2_genomic_indexes'])]
if map_to_mask:
jobstr += ['mkdir -p {}/{}/{}'.format(project_path, 'map_to_mask', sample)]
else:
jobstr += ['mkdir {}/{}'.format(project_path, sample)]
if (map_to_mask) and (not os.path.exists(os.path.join(project_path, 'map_to_mask', sample))):
jobstr += ['mkdir -p {}'.format(os.path.abspath(os.path.join(project_path, 'map_to_mask', sample)))]
elif not os.path.exists(os.path.join(project_path, sample)):
jobstr += ['mkdir {}'.format(os.path.abspath(os.path.join(project_path, sample)))]

R1reads = [read for read in reads if read[-15:-13] == 'R1']
R1reads.sort()
Expand Down Expand Up @@ -176,20 +176,41 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
os.path.join(project_path, sample,
'accepted_hits.sorted.bam'))))]

if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq']):
# htseq-count doesn't work well with positional sorted bam files. Switch to featureCounts from subread package
# if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq']):
# logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
# jobstr += ['echo "htseq"\nhtseq-count {} -f bam -r pos {} {} -o {} > {}'.format(defaults['htseq_options'],
# (os.path.abspath(
# os.path.join(project_path, sample,
# 'accepted_hits.sorted.bam'))),
# ((ref['gff_genome']) if ref.get(
# 'gff_genome') else ''),
# (os.path.abspath(
# os.path.join(project_path, sample,
# 'htseq_counts.sam'))),
# (os.path.abspath(
# os.path.join(project_path, sample,
# 'htseq_counts.out'))))]

# it turnsout featureCounts does a name based sorting before operating. therefore this might be just as inefficient.
if any(job for job in jobs if job in ['featureCounts', 'edgeR', 'DESeq']):
logger.info('Using htseq options: {}'.format(defaults['htseq_options']))
jobstr += ['echo "htseq"\nhtseq-count {} -f bam -r pos {} {} -o {} > {}'.format(defaults['htseq_options'],
(os.path.abspath(
os.path.join(project_path, sample,
'accepted_hits.sorted.bam'))),
((ref['gff_genome']) if ref.get(
'gff_genome') else ''),
(os.path.abspath(
os.path.join(project_path, sample,
'htseq_counts.sam'))),
(os.path.abspath(
os.path.join(project_path, sample,
'htseq_counts.out'))))]
jobstr += ['echo "featureCounts"\nfeatureCounts {} -a {} -g {} -o {} {}'.format(defaults['featureCounts_options'],
(os.path.abspath(
os.path.join(project_path,
sample,
'accepted_hits.sorted.bam'))),
((ref['gff_genome']) if ref.get(
'gff_genome') else ''),
((ref['fa_genome']) if ref.get(
'fa_genome') else ''),
(os.path.abspath(os.path.join(project_path,
sample,
'htseq_counts.out'))),
(os.path.abspath(
os.path.join(project_path,
sample,
'htseq_counts.sam'))))]

return '\n\n'.join(jobstr).replace('PPN', ppn)

Expand All @@ -208,8 +229,15 @@ def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', wallt
# TODO: clear out virtual environment arguments if this works
jobstr += ['#PBS -V']

# this is for htseq-count, which won't be used. For now, featureCounts will be used.
# jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(os.path.abspath(os.path.join(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
# os.path.abspath(project_path),
# os.path.abspath(os.path.join(os.path.join(project_path, 'groups.json'))),
# output)]

# line for featureCounts
jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(os.path.abspath(os.path.join(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
project_path,
os.path.abspath(project_path),
os.path.abspath(os.path.join(os.path.join(project_path, 'groups.json'))),
output)]

Expand Down Expand Up @@ -407,7 +435,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
# collect htseq counts
if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq', 'htseq-count-collect']) or (jobs == []):
try:
js = collect_counts_job(project_path=project_path, output=results_path, mapjobIDs=mapjobIDs, defaults=defaults)
js = collect_counts_job(project_path=project_path, output=os.path.abspath(os.path.join(results_path, 'htseq_counts_collected')), mapjobIDs=mapjobIDs, defaults=defaults)
collectjobID = job_submitter(js=js, path=job_files_path, name='job_htseq_count_collector.sh')
except Exception as ex:
logger.error(
Expand Down

0 comments on commit dd42a01

Please sign in to comment.