-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstripSequences.pl
executable file
·97 lines (85 loc) · 3.74 KB
/
stripSequences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env perl
# Filename: stripSequences.pl
# Description: Removes extraneous sequences from FASTA.
#
# Date dedicated: 2023-02-10
# Author: Samuel S. Shepard, Centers for Disease Control and Prevention
#
# Citation: Unpublished
#
# =============================================================================
#
# PUBLIC DOMAIN NOTICE
#
# This source code file or script constitutes a work of the United States
# Government and is not subject to domestic copyright protection under 17 USC §
# 105. This file is in the public domain within the United States, and
# copyright and related rights in the work worldwide are waived through the CC0
# 1.0 Universal public domain dedication:
# https://creativecommons.org/publicdomain/zero/1.0/
#
# The material embodied in this software is provided to you "as-is" and without
# warranty of any kind, express, implied or otherwise, including without
# limitation, any warranty of fitness for a particular purpose. In no event
# shall the Centers for Disease Control and Prevention (CDC) or the United
# States (U.S.) government be liable to you or anyone else for any direct,
# special, incidental, indirect or consequential damages of any kind, or any
# damages whatsoever, including without limitation, loss of profit, loss of
# use, savings or revenue, or the claims of third parties, whether or not CDC
# or the U.S. government has been advised of the possibility of such loss,
# however caused and on any theory of liability, arising out of or in
# connection with the possession, use or performance of this software.
#
# Please provide appropriate attribution in any work or product based on this
# material.
use English qw(-no_match_vars);
use Getopt::Long;
use Carp qw(croak);
use warnings;
use strict;
# Command-line switches; each one defaults to off.
my ( $fixHeader, $stripLower, $stripBadBases ) = ( 0, 0, 0 );
GetOptions( 'fix-header|F' => \$fixHeader, 'strip-lower|L' => \$stripLower, 'remove-bad-bases|N' => \$stripBadBases );

# -N and -L are mutually exclusive. With either one, the FASTA file is the only
# positional argument; with neither, a second positional argument must supply
# the characters to delete.
my $expected_args = ( $stripLower || $stripBadBases ) ? 1 : 2;
if ( ( $stripLower && $stripBadBases ) || scalar @ARGV != $expected_args ) {
    die(    "\nUsage:\n\t$PROGRAM_NAME <file.fas> {-N|-L|<quoted_characters_to_delete>}\n"
          . "\t\t-N|--remove-bad-bases\tRemoves invalid nucleotide characters from the sequence.\n"
          . "\t\t-L|--strip-lower\tRemoves lowercase letters from the sequence.\n"
          . "\t\t-F|--fix-header\t\tRemoves and replaces troublesome characters from the FASTA header.\n"
          . "\n" );
}

# Include the operating-system reason so a failed open can be diagnosed
# (missing file versus permissions, and so on).
open( my $IN, '<', $ARGV[0] ) or die("$PROGRAM_NAME ERROR: Cannot open $ARGV[0]: $OS_ERROR\n");
# PREPARE the strip deletion
#
# tr/// builds its character table at compile time, so a user-supplied list of
# characters can only be used through a string eval. quotemeta() first
# backslash-escapes every non-word character (including '/', '\' and '-'), so
# the argument can neither terminate the tr/// nor form a range. The code is
# compiled once into a sub here instead of being re-evaluated for every record.
# NOTE(review): this is a string eval built from command-line input; the
# quotemeta() call is what keeps it safe and must not be removed.
my $strip;
my $delete_custom;
if ( scalar @ARGV > 1 ) {
    $strip         = quotemeta( $ARGV[1] );                                # save input
    $strip         = 'sub { $_[0] =~ tr/' . $strip . '//d; return 1; }';    # create safe eval
    $delete_custom = eval($strip) or croak("Error in eval: $strip\n$EVAL_ERROR\n");
}

# Read one FASTA record per iteration by using '>' as the input record
# separator. Consequently a '>' that appears inside a header also ends the
# record.
local $RS = ">";
while ( my $fasta_record = <$IN> ) {
    chomp($fasta_record);    # removes the trailing '>' separator

    # First line is the header; the remaining lines are joined into one
    # sequence string. Any of CRLF, LF or CR is accepted as a line ending.
    my @lines    = split( /\r\n|\n|\r/smx, $fasta_record );
    my $header   = shift(@lines);
    my $sequence = join( q{}, @lines );

    # Skip records without sequence data (this includes the empty chunk read
    # before the first '>').
    if ( length($sequence) == 0 ) { next; }

    if ($fixHeader) {
        $header =~ s/^\s*(.*?)\s*$/$1/smx;    # trim surrounding whitespace
        $header =~ s/[\s:]/_/gsmx;            # whitespace and colons become underscores
        $header =~ tr/',//d;                  # drop apostrophes and commas
    }

    if ($stripLower) {

        # The search list is a plain range: tr/// is not a regex, so writing
        # square brackets here would delete literal '[' and ']' as well.
        $sequence =~ tr/a-z//d;
    } elsif ($stripBadBases) {

        # Only IUPAC codes allowed for bases.
        # See: https://www.bioinformatics.org/sms/iupac.html
        $sequence =~ s/[^gcatrykmbvdhunswGCATRYKMBVDHUNSW.-]//gsmx;
    } else {

        # Deletes the user-specified characters in place ($_[0] aliases
        # $sequence inside the compiled sub).
        $delete_custom->($sequence);
    }

    # A record whose sequence was stripped down to nothing is not written.
    if ( length($sequence) == 0 ) { next; }
    print STDOUT '>', $header, "\n", $sequence, "\n";
}
close $IN or croak("Could not close file: $OS_ERROR\n");