-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_gencode_annotation
executable file
·108 lines (85 loc) · 2.44 KB
/
clean_gencode_annotation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/perl -w
use warnings;
use strict;
use POSIX;
use Getopt::Long;
use File::Path;
use File::Basename;
use Pod::Usage;
=head1 NAME
clean_gencode_annotation
=head1 SYNOPSIS
clean_gencode_annotation [options]
=head1 DESCRIPTION
Removes unnecessary annotation.
=cut
#option variables
my $help;
my $verbose;
my $debug;
#initialize options
Getopt::Long::Configure ('bundling');
if(!GetOptions ('h'=>\$help, 'v'=>\$verbose, 'd'=>\$debug)
|| scalar(@ARGV)!=1)
{
if ($help)
{
pod2usage(-verbose => 2);
}
else
{
pod2usage(1);
}
}
my $gencodeAnnotationGTFFile = $ARGV[0];
open(GTF, "zcat $gencodeAnnotationGTFFile |") || die "Cannot open $gencodeAnnotationGTFFile";
# //populate interval trees with reference sets
# //gene transfer format
# //1 chr1
# //2 HAVANA - source
# //3 exon - feature
# //4 13221 - start
# //5 14409 - end
# //6 . - score
# //7 + - strand
# //8 . - frame
# //9 gene_id "ENSG - attribute
# //gene_id "ENSG00000149656.4";
# //transcript_id "ENST00000425473.1";
# //gene_type "processed_transcript";
# //gene_status "KNOWN";
# //gene_name "LINC00266-1";
# //transcript_type "processed_transcript";
# //transcript_status "KNOWN";
# //transcript_name "LINC00266-1-002"; exon_number 3;
# //exon_id "ENSE00002450675.1";
# //level 2; havana_gene "OTTHUMG00000033036.2";
# //havana_transcript "OTTHUMT00000080305.1";
while(<GTF>)
{
s/\r?\n?$//;
if (/^#/)
{
print "$_\n";
}
else
{
my ($chrom, $src, $feature, $start, $end, $score, $strand, $frame, $attribute) = split("\t", $_, 10);
my @ATTRIB = split(';', $attribute);
my @newAttribs = ();
for my $attrib (@ATTRIB)
{
$attrib =~ s/^\s+//;
my @fields = split(' ', $attrib);
if ($fields[0] eq "gene_type" ||
$fields[0] eq "gene_name" ||
$fields[0] eq "transcript_type" ||
$fields[0] eq "level")
{
push(@newAttribs, $attrib);
}
}
print "$chrom\t$src\t$feature\t$start\t$end\t$score\t$strand\t$frame\t" . join(';', @newAttribs) . "\n";
}
}
close(GTF);