-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathget_final_cat_gtf.pl
98 lines (85 loc) · 2.29 KB
/
get_final_cat_gtf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/perl
#############################################
#Author: Jiang Li
#email: riverlee2008@gmail.com
#Creat Time: Tue 23 Oct 2012 01:37:54 PM CDT
#Vanderbilt Center for Quantitative Sciences
#############################################
use strict;
use warnings;
=pod
=head1 SYNOPSIS
Merge the mRNA gtf (exon) and CDS gtf (exon) together by fill the values of transcript_id and transcript_name in the CDS gtf file. This help define the relationship between transcript_id and protein_id, the relationship is parsed from the proteinid.gb file
=head1 USAGE
perl get_final_cat_gtf.pl
=head1 RESULTS
In your current working directory. There will generate a file named as felCat5_final.gtf
=cut
#1) Get all the transcript id
my %transcript;
my @gtfs=<chr??.gtf>;
push @gtfs,<chr?.gtf>;
foreach my $f (@gtfs){
open(IN,$f) or die $!;
while(<IN>){
if(/transcript_id "(.*?)"/){
#$transcript{$1}++;
my $long=$1;
my $short=$long;
$short=~s/\..*//g;
$transcript{$short}=$long;
}
}
}
print "There are total ",scalar(keys %transcript)," transcripts\n";
#2) get the np to nx mapping
my @nps=<NP/*.gb>;
my %np2nm;
foreach my $f (@nps){
my $np = $f;
$np=~s/NP\/|\.gb//g;
open(IN,$f) or die $!;
my $s="";
while(<IN>){$s.=$_;};
my $nm="";
if($s=~/DBSOURCE\s+REFSEQ:\s+accession\s+(.*?)\n/){
$nm=$1;
}
$nm=~s/\..*//g;
if(!exists($transcript{$nm})){
print $np,"\t$nm\n";
}
$np2nm{$np}=$nm;
}
#3) Write out all
open(OUT,">felCat5_final.gtf") or die $!;
foreach my $f (@gtfs){
open(IN,$f) or die $!;
while(<IN>){
print OUT $_;
}
}
my @cds=<chr*_CDS.gtf>;
#print join "\n",@cds;
foreach my $f (@cds){
open(IN,$f) or die $!;
while(<IN>){
my $np="";
if(/protein_id "(.*?)"/){
$np=$1;
}
my $nmshort=$np2nm{$np};
if(!defined($nmshort)){
print "$np\t$_";
next;
}
if(exists($transcript{$nmshort})){
my $nm=$transcript{$nmshort};
s/transcript_id ".*?"/transcript_id "$nm"/g;
s/transcript_name ".*?"/transcript_name "$nm"/g;
print OUT $_;
}
}
}
close OUT ;
system("sort -k1,4,3 felCat5_final.gtf -0 felCat5_final.gtf");