-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsplitfasta_mod.pl
128 lines (106 loc) · 3.51 KB
/
splitfasta_mod.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#! /usr/bin/env perl
# splitfasta.pl
# Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
#
# (C) Johannes Soeding, 2012
#
# HHsuite version 2.0.15 (June 2012)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
use lib $ENV{"HHLIB"}."/scripts";
#use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
use warnings;
# Command-line handling: declares the script-wide variables, prints the usage
# text when no input file is given, and evaluates the options that follow the
# input file name.

# Default extension of the generated files; replaced by the value of -ext.
my $ext="seq";
my $usage="
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.
Usage: splitfasta.pl infile [option]
Option:
-fam : use family-based name (for SCOP/ASTRAL sequences)
-name : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";
# Without an input file there is nothing to do: print the help text and stop.
if (@ARGV<1) {die $usage;}
my $line;
my $infile=$ARGV[0]; # first argument: the multi-sequence FASTA file
my $outfile;
my $sequence="";
my $options=""; # all arguments after the input file, joined by single blanks
my $fam=0; # option -fam?
my $famid="";
my %numfams=();
my $n=0; # number of name lines read in so far
if (@ARGV>1) {
    $options.=join(" ",@ARGV[1..$#ARGV]);
}
# Parse the options.
# -ext is consumed first, together with its value, so that the flag patterns
# below can never match inside the extension (e.g. "-ext x-name"). The value
# must not start with "-", so "-ext -fam" does not swallow the -fam flag.
# With several -ext options the last one wins.
if ($options=~s/(?:^|\s)-ext\s+([^-\s]\S*)//g) {$ext=$1;}
# Flags are recognised only as whole, whitespace-delimited tokens.
# -name is evaluated after -fam and therefore overrides it, whatever the
# order on the command line.
if ($options=~s/(?:^|\s)-fam(?=\s|$)//g) {$fam=1;}
if ($options=~s/(?:^|\s)-name(?=\s|$)//g) {$fam=0;}
# Write one complete FASTA record (name line plus its sequence lines) to a file.
#   $path   : name of the file to create; an existing file is overwritten
#   $record : text of the record
# Dies with an error message if the file cannot be opened, written or closed.
sub write_record {
    my ($path, $record) = @_;
    open(my $out, '>', $path) or die("ERROR: Can't open $path: $!\n");
    print {$out} $record or die("ERROR: Can't write to $path: $!\n");
    # Buffered write errors only surface on close, so check it as well.
    close($out) or die("ERROR: Can't close $path: $!\n");
    return;
}

# Split a multi-sequence FASTA file into one file per record.
#   $fasta     : name of the input file
#   $by_family : true  -> files are named <family>.<k>.<suffix>, where <family>
#                         is the second word of the name line and <k> counts
#                         the records seen so far for that family
#                false -> files are named <id>.<suffix>, where <id> is the first
#                         word after ">" with "|" and "." replaced by "_"
#   $suffix    : file name extension, without the leading dot
# Returns the number of records (name lines) found.
#
# Notes:
# - "/" in a file name is replaced by "_" so that every file is created in the
#   current directory, as promised by the usage text.
# - Lines in front of the first name line are discarded.
# - In family mode a ">" line without a second word is not recognised as a
#   name line; it is appended to the record being collected.
# - In name mode a repeated id triggers a warning on STDOUT and the later
#   record overwrites the file of the earlier one.
sub split_fasta {
    my ($fasta, $by_family, $suffix) = @_;

    # The single capture group yields the word the file name is built from.
    my $header = $by_family ? qr/^>\S+\s+(\S+)/ : qr/^>(\S+)/;

    my %per_family;    # family id -> number of records seen so far
    my %seen;          # sequence id -> true once it has been encountered
    my $path;          # output file of the record being collected
    my $record = "";   # text of the record being collected
    my $count  = 0;    # number of name lines read so far

    open(my $in, '<', $fasta) or die("ERROR: Can't open $fasta: $!\n");
    while (my $row = <$in>) {
        my ($key) = $row =~ $header;
        if (!defined $key) {
            # Not a name line: it belongs to the current record.
            $record .= $row;
            next;
        }

        # A new record starts here, so the previous one is complete.
        write_record($path, $record) if $count;

        my $stem;
        if ($by_family) {
            # Number the records of each family separately, starting at 1.
            $stem = "$key." . ++$per_family{$key};
        } else {
            if ($seen{$key}++) {
                print("\nWarning: id $key appears more than once in $fasta\n");
            }
            ($stem = $key) =~ tr{|.}{_};
        }
        $stem =~ tr{/}{_};    # never leave the current directory

        $path   = "$stem.$suffix";
        $record = $row;
        $count++;
    }
    # The last record is not followed by another name line; write it now.
    write_record($path, $record) if $count;
    close($in);

    return $count;
}

$n=split_fasta($infile,$fam,$ext);
printf("Created %i sequence files\n",$n);