-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path0_initial_parser.pl
102 lines (79 loc) · 2.87 KB
/
0_initial_parser.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl -w
#
# INITIAL PARSER
#
# Version 2.0 January 2018
#
# Juan Carlos González Sánchez & Nicola Bordin
# Centro Andaluz de Biología del Desarrollo (CABD)
# Universidad Pablo de Olavide, Sevilla
#
# The script takes the protein sequences FASTA-format file which constitute the primary INPUT of the pipeline and extract basic info:
# - Protein name or ID, and description (from FASTA header)
# - Belonging organism (if possible)
# - Protein length
# - Total number of proteins
#
# Usage:
# ~$ perl <this_script.pl> <input_file> identifier
use strict;
use warnings;
use Cwd;
use Compress::Raw::Zlib;
scalar (@ARGV) == 2 or die $!;
# DEFINE VARIABLES #####################################################################
my $input_file = $ARGV[0];
my $identifier = $ARGV[1];
my $output_file = "00_ini_".$identifier.".tsv.gz";
my %proteins;
my ($name, $sequence, $organism, $i, $total_pts) = ("","","",0,0);
########################################################################################################################################
# NCBI FASTA header:
# >gi|168697872|ref|ZP_02730149.1| hypothetical protein GobsU_00010 [Gemmata obscuriglobus UQM 2246]
#
# UniProt FASTA header:
# >sp|B2UQS2|ACP_AKKM8 Acyl carrier protein OS=Akkermansia muciniphila (strain ATCC BAA-835) GN=acpP PE=3 SV=1
open (IN, "$input_file");
while (my $line = <IN>) {
# NCBI like header
if ($line =~ /^>([^\s]+)\s+(.+)OS=(.+)\sGN=/) {
$i++;
$name =$1;
$proteins{$name}{"description"} = $2;
$proteins{$name}{"organism"} = $3;
$proteins{$name}{"order"} = $i;
$proteins{$name}{"sequence"} = '';
# UniProt like header
} elsif ($line =~ /^>([^\s]+)\s+(.+)\[(.+)\]/) {
$i++;
$name =$1;
$proteins{$name}{"description"} = $2;
$proteins{$name}{"organism"} = $3;
$proteins{$name}{"order"} = $i;
$proteins{$name}{"sequence"} = '';
# Other header (not possible to extract info)
} elsif ($line =~ /^>(.+)/) {
$i++;
$name = $1;
$proteins{$name}{"description"} = " - ";
$proteins{$name}{"organism"} = "unknown organism";
$proteins{$name}{"order"} = $i;
$proteins{$name}{"sequence"} = '';
} elsif ($line =~ /^\w/) {
$proteins{$name}{"sequence"} .= $line;
chomp $proteins{$name}{"sequence"};
$proteins{$name}{"length"} = length $proteins{$name}{"sequence"};
}
}
close (IN);
## Keep total number of proteins (for future use)
$total_pts = $i; $i=0;
# CREATE OUTPUT FILE(s)
open (OUT, " | gzip > $output_file") or die $!;
## Print extracted info in an EXCEL-like table (columns separated by "tab").
print OUT "NAME / ID\tDescription\tOrganism\tProtein Length (aa)\n";
foreach my $name (sort {$proteins{$a}{"order"} <=> $proteins{$b}{"order"}} keys %proteins) {
print OUT $name,"\t",$proteins{$name}{"description"},"\t",$proteins{$name}{"organism"},"\t",$proteins{$name}{"length"},"\n";#,$proteins{$name}{"sequence"},"\n";
}
close (OUT);
exit 7;