diff --git a/scripts/tcga_boxplot/README b/scripts/tcga_boxplot/README index 1b51fac..5349b5b 100644 --- a/scripts/tcga_boxplot/README +++ b/scripts/tcga_boxplot/README @@ -1,17 +1,14 @@ ---normalize_gene_expression.pl - The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA +--quartile_norm.pl +The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA + +--filter_genes.py +Usage: python filter_genes.py <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt +Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol. + The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest + The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script ---genename_to_HUGO.py - Purpose: Changes the ENSEMBL geneID to HUGO symbol. - Usage scenario: This script is used in RNASeq snakemake pipeline. The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script - Usage example: genename_to_HUGO.py > OB225.genes.results.normalized.genes_filtered.txt - --------------------- ----------------------------------------- ------------------------------------------------- - genes_of_interest.txt | OB225.genes.results.normalized.header.txt | OB225.genes.results.normalized.genes_filtered.txt - --------------------- ----------------------------------------- ------------------------------------------------- - AKT1,ENSG00000142208 gene_id OB225 gene_id OB225 - ENSG00000142208.11 1067.7043 AKT1 1067.7043 --parse_rna_tcga_pat_data.r - Command line script for generating individual expression context figures from NEXUS +Command line script for generating individual expression context figures from NEXUS ---variant_expression_context.TCGA.list_of_genes.R - Command line script for generating individual expression context figures from NEXUS +--variant_expression_context.TCGA.list_of_genes.R +Command line script for generating individual expression context figures from NEXUS diff --git a/scripts/tcga_boxplot/filter_genes.py b/scripts/tcga_boxplot/filter_genes.py new file mode 100644 index 0000000..ea8aec3 --- /dev/null +++ b/scripts/tcga_boxplot/filter_genes.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +import sys + +""" +Usage: python filter_genes.py <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt +Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol. +The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest +The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script +Author: Tinu Thomas +Date: July, 2019 +""" +genes={} + +# Reading genes of interest file and storing genes in the dictionary 'genes' +with open(sys.argv[1],'r') as fgene: + for line in fgene: + line=line.strip().split(',') + genes[line[1]]=line[0] # stored HUGO symbol + +# Reading the normalized data file and extracting information for genes of interest +with open(sys.argv[2],'r') as fnorm: + for data in fnorm: + data=data.strip().split('\t') + if data[0] == 'gene_id': + print('\t'.join(data)) + else: + if data[0].split('.')[0] in genes: + print('\t'.join([ genes[data[0].split('.')[0]] ] + data[1:] )) # changes the ENSEMBL geneID in the normalized file to HUGO symbol diff --git a/scripts/tcga_boxplot/quartile_norm.pl b/scripts/tcga_boxplot/quartile_norm.pl new file mode 100755 index 0000000..5fdd0b7 --- /dev/null +++ b/scripts/tcga_boxplot/quartile_norm.pl @@ -0,0 +1,102 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; + +my $out = '-'; +my $q = 75; +my @col; +my @also; +my $names = 1; +my $target = 1000; +my $skip = 0; +my $min=1; +GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min); + +my $in = shift @ARGV; + +die usage() unless $in && @col; + +open(OUT, ($out eq '-') ? '<&STDOUT' : ">$out") || die "Can't open $out\n"; +open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n"; + +@also = (1) if !@also && !grep {$_ eq '1'} @col; + +map {$_--} @col; +map {$_--} @also; + +my @d; +my $cnt = 0; +my $head =''; +while() { + if ($skip) { + --$skip; + $head .= $_; + next; + } + chomp; + my @f = split /\t/; + if ($col[0] eq '-2') { + @col = (1..$#f); + } + for (@col) { + push @{$d[$_]}, $f[$_]; + } + for (@also) { + push @{$d[$_]}, $f[$_]; + } + ++$cnt; +} +for (@col) { + my @t = grep {$_>=$min} @{$d[$_]}; + @t = sort {$a <=> $b} @t; + my $t=quantile(\@t, $q/100); + for (@{$d[$_]}) { + $_= sprintf "%.4f", $target*$_/$t; + } +} + +my @out = (sort {$a <=> $b} (@col, @also)); + +print OUT $head; + +for (my $i=0;$i<$cnt;++$i) { + for my $j (@out) { + print OUT "\t" unless $j == $out[0]; + print OUT $d[$j][$i]; + } + print OUT "\n"; +} + + +sub usage { +<[int($t)]; + if ($t > int($t)) { + return $v + $p * ($a->[int($t)+1] - $v); + } else { + return $v; + } +} diff --git a/setup.py b/setup.py index d929395..61c0e17 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,8 @@ 'scripts/hypoxia/tupro_pipeline_hypoxia.R', 'scripts/hypoxia/hypoxia_plots.r', 'scripts/tcga_boxplot/variant_expression_context.TCGA.list_of_genes.R', - 'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r' + 'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r', + 'scripts/tcga_boxplot/quartile_norm.pl' ], install_requires=requirements, license="MIT license",