Skip to content

Commit

Permalink
feat: collect align summaries
Browse files Browse the repository at this point in the history
  • Loading branch information
meono committed Sep 22, 2016
1 parent d393297 commit 01043f3
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 4 deletions.
49 changes: 46 additions & 3 deletions iLoop_RNAseq_pipeline/master_qsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ def mapandlink_jobs(project_path, sample, reads, defaults, ref, jobs, ppn='8', w
return '\n\n'.join(jobstr).replace('PPN', ppn)


def collect_stats_job(project_path, output, mapjobIDs, defaults, ppn='1', walltime="00:05:00", map_to_mask=False):
    """Build the PBS job script that collects per-sample alignment summaries.

    Parameters
    ----------
    project_path : str
        Root of the project folder.
    output : str
        Path of the tsv file collect_align_summaries.py should write.
    mapjobIDs : list of str
        IDs of the map-and-link jobs this job must wait for; [''] means
        "no dependency".
    defaults : dict
        Must provide at least the 'project' and 'email' entries used to
        fill in the job header template.
    ppn : str
        Processors per node requested from the scheduler.
    walltime : str
        Maximum wall clock time for the job.
    map_to_mask : bool
        True when the summaries of the mask run should be collected; the
        collector script is then invoked with its -m flag.

    Returns
    -------
    str
        The complete job script, sections separated by blank lines.
    """
    jobstr = []
    jobstr += [job_header.replace('JOBNAME', ('collect_stats' if not map_to_mask else 'collect_stats_mask'))
               .replace('WALLTIME', walltime)
               .replace('PROJECT', defaults['project'])
               .replace('DEPEND', (
                   'afterok:{}'.format(':'.join(mapjobIDs)) if mapjobIDs != [''] else ''))
               .replace('JOB_OUTPUTS', abspath(join_path(project_path, 'job_outputs')))
               .replace('EMAILADDRESS', defaults['email'])]

    # Pass all environmental variables to the job - this should take care of the virtual environment issue
    # TODO: clear out virtual environment arguments if this works
    jobstr += ['#PBS -V']

    # Fix: pass -m for the mask run (not the regular one) — the collector's
    # -m option makes it read summaries from the 'map_to_mask' subfolder.
    jobstr += ['python {}/collect_align_summaries.py -p {} -g {} -o {} {}'.format(
        abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
        abspath(project_path),
        abspath(join_path(project_path, 'inputs', 'groups.json')),
        output,
        ('-m' if map_to_mask else ''))]

    return '\n\n'.join(jobstr).replace('PPN', ppn)


def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', walltime="00:05:00"):
jobstr = []
jobstr += [job_header.replace('JOBNAME', 'collect_counts')
Expand All @@ -227,13 +250,13 @@ def collect_counts_job(project_path, output, mapjobIDs, defaults, ppn='1', wallt
# this is for htseq-count, which won't be used. For now, featureCounts will be used.
# jobstr += ['python {}/htseq_count_collector.py -p {} -g {} -o {} '.format(abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
# abspath(project_path),
# abspath(join_path(join_path(project_path, 'groups.json'))),
# abspath(join_path(project_path, 'groups.json')),
# output)]

# line for featureCounts
jobstr += ['python {}/featureCounts_collector.py -p {} -g {} -o {} '.format(abspath(join_path(iLoop_RNAseq_pipeline.__path__[0], 'scripts')),
abspath(project_path),
abspath(join_path(join_path(project_path, 'inputs', 'groups.json'))),
abspath(join_path(project_path, 'inputs', 'groups.json')),
output)]

return '\n\n'.join(jobstr).replace('PPN', ppn)
Expand Down Expand Up @@ -444,16 +467,19 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
# generate and submit map and link jobs
if map_to_mask:
try:
maskjobIDs = []
for group_name, group in groups.items():
for sample, reads in group.items():
js = mapandlink_jobs(project_path=project_path, sample=sample, reads=reads, ref=ref,
defaults=defaults, ppn=ppn, jobs=jobs, map_to_mask=map_to_mask)
job_submitter(js, job_files_path, 'job_{}_mapandlink_to_mask.sh'.format(sample))
maskjobIDs.append(job_submitter(js, job_files_path, 'job_{}_mapandlink_to_mask.sh'.format(sample)))
except Exception as ex:
logger.error(
'Problem with map and link. RNAseq analysis is stopped.\nAn exception of type {} occured. Arguments:\n{}'.format(
type(ex).__name__, ex.args))
return False
else:
maskjobIDs = ['']
# TODO: a threshold for mapping to mask/genome can be used to decide whether to go forward or stop pipeline. At least generate a warning.

mljobs = ['hisat2', 'stringtie', 'cufflinks', 'htseq-count', 'featureCounts']
Expand All @@ -478,6 +504,7 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
else:
mapjobIDs = ['']


# collect htseq counts - Ignore this since featureCounts is prefered now
# if any(job for job in jobs if job in ['htseq-count', 'edgeR', 'DESeq', 'htseq-count-collect']) or (jobs == []):
# try:
Expand All @@ -499,6 +526,22 @@ def job_organizer(project_path, groups, ref, defaults, map_to_mask, ppn='8', rea
type(ex).__name__, ex.args))
return False

if any(job for job in jobs if job in ['collect_stats', 'hisat2', 'stringtie', 'cufflinks']) or (jobs == []):
statjobID = []
try:
if map_to_mask:
js = collect_stats_job(project_path=project_path, output=abspath(join_path(results_path, 'align_summaries_mask.tsv')), mapjobIDs=maskjobIDs, defaults=defaults, map_to_mask=map_to_mask)
collectjobID.append(job_submitter(js=js, path=job_files_path, name='job_collect_stats_mask.sh'))
js = collect_stats_job(project_path=project_path, output=abspath(join_path(results_path, 'align_summaries.tsv')), mapjobIDs=maskjobIDs, defaults=defaults, map_to_mask=False)
statjobID.append(job_submitter(js=js, path=job_files_path, name='job_collect_stats.sh'))
except Exception as ex:
logger.error(
'Problem with collect stats script. RNAseq analysis is stopped.\nAn exception of type {} occured. Arguments:\n{}'.format(
type(ex).__name__, ex.args))
return False
else:
statjobID = ['']

if 'edgeR' in jobs:
try:
js = edgeR_job(project_path=project_path, groups=groups, output=results_path, collectjobID=collectjobID, defaults=defaults)
Expand Down
95 changes: 95 additions & 0 deletions iLoop_RNAseq_pipeline/scripts/collect_align_summaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python

"""Collect per-sample alignment summaries (tophat2 or hisat2) into one table.

For every sample listed in the groups json, parse its ``align_summary.txt``
and append one row to a DataFrame, which is written tab-separated to the
file given with ``-o``.
"""

import argparse
import json
import logging
import os

import pandas as pd

logger = logging.getLogger(__name__)

# Column layouts for the two summary formats this script understands.
TOPHAT2_COLUMNS = ['Left_reads', 'Mapped_Left_reads', 'Percent_mapped_Left_reads',
                   'Multiple_alignments_Left_reads',
                   'Percent_multiple_alignments_Left_reads',
                   'Right_reads', 'Mapped_Right_reads', 'Percent_mapped_Right_reads',
                   'Multiple_alignments_Right_reads',
                   'Percent_multiple_alignments_Right_reads',
                   'Aligned_pairs', 'Multiple_alignments_pairs',
                   'Percent_multiple_alignments_pairs', 'Concordant_pair_alignment_rate']
HISAT2_COLUMNS = ['Reads', 'Paired_reads', 'Pairs_not_concordantly_aligned',
                  'Pairs_concordantly_aligned_once', 'Pairs_concordantly_aligned_multiple',
                  'Pairs_discordantly_aligned_once',
                  'Remaining_singles_not_aligned', 'Remaining_singles_aligned_once',
                  'Remaining_singles_aligned_multiple', 'Overall_alignment_percentage']


def main():
    parser = argparse.ArgumentParser(description='This script will collect align summaries based on hisat2 run.')
    parser.add_argument('-p', '--project-path', help='Path to project folder')
    parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
    parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./results/align_summaries.tsv')
    parser.add_argument('-m', '--map-to-mask', help='Collect align summaries for mask only run.', action='store_true', default=False)
    args = parser.parse_args()

    with open(args.groups_json, 'r') as fh:
        groups = json.load(fh)

    if args.map_to_mask:
        pp = os.path.abspath(os.path.join(args.project_path, 'map_to_mask'))  # pp: path prefix
    else:
        pp = os.path.abspath(args.project_path)  # pp: path prefix

    df = None  # created lazily once the summary flavor is known
    for sample in [sample for group in groups.values() for sample in group.keys()]:
        # Fix: parsed values are keyed by the sample name (the original code
        # indexed df.loc with an undefined variable 'i').
        sum_type = None
        side = False
        with open(os.path.join(pp, sample, 'align_summary.txt'), 'r') as f:
            for line in f:
                if line.startswith('Left'):  # it is tophat summary
                    sum_type = 'tophat2'
                    if df is None:
                        df = pd.DataFrame(columns=TOPHAT2_COLUMNS)
                elif line.endswith('reads; of these:\n'):  # it is hisat2 summary
                    sum_type = 'hisat2'
                    if df is None:
                        df = pd.DataFrame(columns=HISAT2_COLUMNS)

                if sum_type == 'tophat2':
                    # Track which side ("Left"/"Right") the indented stats
                    # below belong to; False while in the pairs section.
                    if line.startswith('Left'):
                        side = 'Left'
                    elif line.startswith('Right'):
                        side = 'Right'
                    elif line.startswith('Aligned'):
                        side = False

                    if line.startswith(' Input :') and side != False:
                        df.loc[sample, side + '_reads'] = line.split()[2]
                    elif line.startswith(' Mapped :') and side != False:
                        df.loc[sample, 'Mapped_' + side + '_reads'] = line.split()[2]
                        df.loc[sample, 'Percent_mapped_' + side + '_reads'] = line.split('(')[1].split('%')[0]
                    elif line.startswith(' of these:') and side != False:
                        df.loc[sample, 'Multiple_alignments_' + side + '_reads'] = line.split()[2]
                        df.loc[sample, 'Percent_multiple_alignments_' + side + '_reads'] = line.split('(')[1].split('%')[0]
                    elif side == False and line.startswith('Aligned pairs:'):
                        df.loc[sample, 'Aligned_pairs'] = line.split()[2]
                    elif side == False and line.startswith(' of these:'):
                        df.loc[sample, 'Multiple_alignments_pairs'] = line.split()[2]
                        df.loc[sample, 'Percent_multiple_alignments_pairs'] = line.split('(')[1].split('%')[0]
                    elif side == False and line.endswith('concordant pair alignment rate.\n'):
                        df.loc[sample, 'Concordant_pair_alignment_rate'] = line.split('%')[0]

                elif sum_type == 'hisat2':
                    if line.endswith('reads; of these:\n'):
                        df.loc[sample, 'Reads'] = line.split()[0]
                    elif line.endswith('were paired; of these:\n'):
                        df.loc[sample, 'Paired_reads'] = line.split()[0]
                    elif line.endswith('aligned concordantly 0 times\n'):
                        df.loc[sample, 'Pairs_not_concordantly_aligned'] = line.split()[0]
                    elif line.endswith('aligned concordantly exactly 1 time\n'):
                        df.loc[sample, 'Pairs_concordantly_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned concordantly >1 times\n'):
                        df.loc[sample, 'Pairs_concordantly_aligned_multiple'] = line.split()[0]
                    elif line.endswith('aligned discordantly 1 time\n'):
                        df.loc[sample, 'Pairs_discordantly_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned 0 times\n'):
                        df.loc[sample, 'Remaining_singles_not_aligned'] = line.split()[0]
                    elif line.endswith('aligned exactly 1 time\n'):
                        df.loc[sample, 'Remaining_singles_aligned_once'] = line.split()[0]
                    elif line.endswith('aligned >1 times\n'):
                        df.loc[sample, 'Remaining_singles_aligned_multiple'] = line.split()[0]
                    elif line.endswith('overall alignment rate\n'):
                        df.loc[sample, 'Overall_alignment_percentage'] = line.split('%')[0]

    if df is None:
        logger.warning('No alignment summaries could be parsed; nothing written.')
        return

    # Fix: honor the -o argument (the original wrote to a hardcoded
    # 'align_summaries.csv' with a space separator); write a real tsv.
    df.to_csv(args.output, sep='\t')


if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion iLoop_RNAseq_pipeline/scripts/featureCounts_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

parser.add_argument('-p', '--project-path', help='Path to project folder')
parser.add_argument('-g', '--groups-json', help='json file for experiment setup with full path.')
parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./htseq/collected_counts')
parser.add_argument('-o', '--output', help='Path and prefix of output file. e.g.: ./featureCounts/collected_counts')
args = parser.parse_args()

groups = json.load(open(args.groups_json, 'r'))
Expand Down

0 comments on commit 01043f3

Please sign in to comment.