Skip to content

Commit

Permalink
feat: collect align summaries
Browse files Browse the repository at this point in the history
  • Loading branch information
meono committed Sep 22, 2016
1 parent d393297 commit 01043f3
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 4 deletions.
49 changes: 46 additions & 3 deletions iLoop_RNAseq_pipeline/master_qsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
return '\n\n'.join(jobstr).replace('PPN', ppn)


def collect_stats_job(project_path, output, mapjobIDs, defaults, ppn='1', walltime="00:05:00", map_to_mask=False):
    """Build the PBS job script that collects per-sample alignment summaries.

    Parameters
    ----------
    project_path : str
        Root of the project folder.
    output : str
        Path of the tsv file collect_align_summaries.py should write.
    mapjobIDs : list of str
        IDs of the map-and-link jobs this job must wait for; [''] means
        "no dependency".
    defaults : dict
        Must provide at least the 'project' and 'email' entries used to
        fill in the job header template.
    ppn : str
        Processors per node requested from the scheduler.
    walltime : str
        Maximum wall clock time for the job.
    map_to_mask : bool
        True when the summaries of the mask run should be collected; the
        collector script is then invoked with its -m flag.

    Returns
    -------
    str
        The complete job script, sections separated by blank lines.
    """
    jobstr = []
    jobstr += [job_header.replace('JOBNAME', ('collect_stats' if not map_to_mask else 'collect_stats_mask'))
               .replace('WALLTIME', walltime)
               .replace('PROJECT', defaults['project'])
               .replace('DEPEND', (
                   'afterok:{}'.format(':'.join(mapjobIDs)) if mapjobIDs != [''] else ''))
               .replace('JOB_OUTPUTS', abspath(join_path(project_path, 'job_outputs')))
               .replace('EMAILADDRESS', defaults['email'])]

    # Pass all environmental variables to the job - this should take care of the virtual environment issue
    # TODO: clear out virtual environment arguments if this works
    jobstr += ['#PBS -V']

    # Fix: pass -m for the mask run (not the regular one) — the collector's
    # -m option makes it read summaries from the 'map_to_mask' subfolder.
    jobstr += ['python {}/collect_align_summaries.py -p {} -g {} -o {} {}'.format(
        abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
        abspath(project_path),
        abspath(join_path(project_path, 'inputs', 'groups.json')),
        output,
        ('-m' if map_to_mask else ''))]

    return '\n\n'.join(jobstr).replace('PPN', ppn)


def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', walltime="00:05:00"):
jobstr = []
jobstr += [job_header.replace('JOBNAME', 'collect_counts')
Expand All @@ -227,13 +250,13 @@ def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', wallt
# this is for htseq-count, which won't be used. For now, featureCounts will be used.
# jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
# abspath(project_path),
# abspath(join_path(join_path(project_path, 'groups.json'))),
# abspath(join_path(project_path, 'groups.json')),
# output)]

# line for featureCounts
jobstr += ['python {}/featureCounts_collector.py -p {} -g {} -o {} '.format(abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
abspath(project_path),
abspath(join_path(join_path(project_path, 'inputs', 'groups.json'))),
abspath(join_path(project_path, 'inputs', 'groups.json')),
output)]

return '\n\n'.join(jobstr).replace('PPN', ppn)
Expand Down Expand Up @@ -444,16 +467,19 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
# generate and submit map and link jobs
if map_to_mask:
try:
maskjobIDs = []
for group_name, group in groups.items():
for sample, reads in group.items():
js = mapandlink_jobs(project_path=project_path, sample=sample, reads=reads, ref=ref,
defaults=defaults, ppn=ppn, jobs=jobs, map_to_mask=map_to_mask)
job_submitter(js, job_files_path, 'job_{}_mapandlink_to_mask.sh'.format(sample))
maskjobIDs.append(job_submitter(js, job_files_path, 'job_{}_mapandlink_to_mask.sh'.format(sample)))
except Exception as ex:
logger.error(
'Problem with map and link. RNAseq analysis is stopped.\nAn exception of type {} occured. Arguments:\n{}'.format(
type(ex).__name__, ex.args))
return False
else:
maskjobIDs = ['']
# TODO: a threshold for mapping to mask/genome can be used to decide whether to go forward or stop pipeline. At least generate a warning.

mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
Expand All @@ -478,6 +504,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
else:
mapjobIDs = ['']


# collect htseq counts - Ignore this since featureCounts is prefered now
# if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq', 'htseq-count-collect']) or (jobs == []):
# try:
Expand All @@ -499,6 +526,22 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
type(ex).__name__, ex.args))
return False

if any(job for job in jobs if job in ['collect_stats', 'hisat2', 'stringtie', 'cufflinks']) or (jobs == []):
statjobID = []
try:
if map_to_mask:
js = collect_stats_job(project_path=project_path, output=abspath(join_path(results_path, 'align_summaries_mask.tsv')), mapjobIDs=maskjobIDs, defaults=defaults, map_to_mask=map_to_mask)
collectjobID.append(job_submitter(js=js, path=job_files_path, name='job_collect_stats_mask.sh'))
js = collect_stats_job(project_path=project_path, output=abspath(join_path(results_path, 'align_summaries.tsv')), mapjobIDs=maskjobIDs, defaults=defaults, map_to_mask=False)
statjobID.append(job_submitter(js=js, path=job_files_path, name='job_collect_stats.sh'))
except Exception as ex:
logger.error(
'Problem with collect stats script. RNAseq analysis is stopped.\nAn exception of type {} occured. Arguments:\n{}'.format(
type(ex).__name__, ex.args))
return False
else:
statjobID = ['']

if 'edgeR' in jobs:
try:
js = edgeR_job(project_path=project_path, groups=groups, output=results_path, collectjobID=collectjobID, defaults=defaults)
Expand Down
95 changes: 95 additions & 0 deletions iLoop_RNAseq_pipeline/scripts/collect_align_summaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python

"""Collect per-sample alignment summaries (tophat2 or hisat2) into one table.

For every sample listed in the groups json, parse its ``align_summary.txt``
and append one row to a DataFrame, which is written tab-separated to the
file given with ``-o``.
"""

import argparse
import json
import logging
import os

import pandas as pd

logger = logging.getLogger(__name__)

# Column layouts for the two summary formats this script understands.
TOPHAT2_COLUMNS = ['Left_reads', 'Mapped_Left_reads', 'Percent_mapped_Left_reads',
                   'Multiple_alignments_Left_reads',
                   'Percent_multiple_alignments_Left_reads',
                   'Right_reads', 'Mapped_Right_reads', 'Percent_mapped_Right_reads',
                   'Multiple_alignments_Right_reads',
                   'Percent_multiple_alignments_Right_reads',
                   'Aligned_pairs', 'Multiple_alignments_pairs',
                   'Percent_multiple_alignments_pairs', 'Concordant_pair_alignment_rate']
HISAT2_COLUMNS = ['Reads', 'Paired_reads', 'Pairs_not_concordantly_aligned',
                  'Pairs_concordantly_aligned_once', 'Pairs_concordantly_aligned_multiple',
                  'Pairs_discordantly_aligned_once',
                  'Remaining_singles_not_aligned', 'Remaining_singles_aligned_once',
                  'Remaining_singles_aligned_multiple', 'Overall_alignment_percentage']


def main():
    parser = argparse.ArgumentParser(description='This script will collect align summaries based on hisat2 run.')
    parser.add_argument('-p', '--project-path', help='Path to project folder')
    parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
    parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./results/align_summaries.tsv')
    parser.add_argument('-m', '--map-to-mask', help='Collect align summaries for mask only run.', action='store_true', default=False)
    args = parser.parse_args()

    with open(args.groups_json, 'r') as fh:
        groups = json.load(fh)

    if args.map_to_mask:
        pp = os.path.abspath(os.path.join(args.project_path, 'map_to_mask'))  # pp: path prefix
    else:
        pp = os.path.abspath(args.project_path)  # pp: path prefix

    df = None  # created lazily once the summary flavor is known
    for sample in [sample for group in groups.values() for sample in group.keys()]:
        # Fix: parsed values are keyed by the sample name (the original code
        # indexed df.loc with an undefined variable 'i').
        sum_type = None
        side = False
        with open(os.path.join(pp, sample, 'align_summary.txt'), 'r') as f:
            for line in f:
                if line.startswith('Left'):  # it is tophat summary
                    sum_type = 'tophat2'
                    if df is None:
                        df = pd.DataFrame(columns=TOPHAT2_COLUMNS)
                elif line.endswith('reads; of these:\n'):  # it is hisat2 summary
                    sum_type = 'hisat2'
                    if df is None:
                        df = pd.DataFrame(columns=HISAT2_COLUMNS)

                if sum_type == 'tophat2':
                    # Track which side ("Left"/"Right") the indented stats
                    # below belong to; False while in the pairs section.
                    if line.startswith('Left'):
                        side = 'Left'
                    elif line.startswith('Right'):
                        side = 'Right'
                    elif line.startswith('Aligned'):
                        side = False

                    if line.startswith(' Input :') and side != False:
                        df.loc[sample, side + '_reads'] = line.split()[2]
                    elif line.startswith(' Mapped :') and side != False:
                        df.loc[sample, 'Mapped_' + side + '_reads'] = line.split()[2]
                        df.loc[sample, 'Percent_mapped_' + side + '_reads'] = line.split('(')[1].split('%')[0]
                    elif line.startswith(' of these:') and side != False:
                        df.loc[sample, 'Multiple_alignments_' + side + '_reads'] = line.split()[2]
                        df.loc[sample, 'Percent_multiple_alignments_' + side + '_reads'] = line.split('(')[1].split('%')[0]
                    elif side == False and line.startswith('Aligned pairs:'):
                        df.loc[sample, 'Aligned_pairs'] = line.split()[2]
                    elif side == False and line.startswith(' of these:'):
                        df.loc[sample, 'Multiple_alignments_pairs'] = line.split()[2]
                        df.loc[sample, 'Percent_multiple_alignments_pairs'] = line.split('(')[1].split('%')[0]
                    elif side == False and line.endswith('concordant pair alignment rate.\n'):
                        df.loc[sample, 'Concordant_pair_alignment_rate'] = line.split('%')[0]

                elif sum_type == 'hisat2':
                    if line.endswith('reads; of these:\n'):
                        df.loc[sample, 'Reads'] = line.split()[0]
                    elif line.endswith('were paired; of these:\n'):
                        df.loc[sample, 'Paired_reads'] = line.split()[0]
                    elif line.endswith('aligned concordantly 0 times\n'):
                        df.loc[sample, 'Pairs_not_concordantly_aligned'] = line.split()[0]
                    elif line.endswith('aligned concordantly exactly 1 time\n'):
                        df.loc[sample, 'Pairs_concordantly_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned concordantly >1 times\n'):
                        df.loc[sample, 'Pairs_concordantly_aligned_multiple'] = line.split()[0]
                    elif line.endswith('aligned discordantly 1 time\n'):
                        df.loc[sample, 'Pairs_discordantly_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned 0 times\n'):
                        df.loc[sample, 'Remaining_singles_not_aligned'] = line.split()[0]
                    elif line.endswith('aligned exactly 1 time\n'):
                        df.loc[sample, 'Remaining_singles_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned >1 times\n'):
                        df.loc[sample, 'Remaining_singles_aligned_multiple'] = line.split()[0]
                    elif line.endswith('overall alignment rate\n'):
                        df.loc[sample, 'Overall_alignment_percentage'] = line.split('%')[0]

    if df is None:
        logger.warning('No alignment summaries could be parsed; nothing written.')
        return

    # Fix: honor the -o argument (the original wrote to a hardcoded
    # 'align_summaries.csv' with a space separator); write a real tsv.
    df.to_csv(args.output, sep='\t')


if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion iLoop_RNAseq_pipeline/scripts/featureCounts_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

parser.add_argument('-p', '--project-path', help='Path to project folder')
parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./htseq/collected_counts')
parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./featureCounts/collected_counts')
args = parser.parse_args()

groups = json.load(open(args.groups_json, 'r'))
Expand Down

0 comments on commit 01043f3

Please sign in to comment.