-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenomescan_3groups.pl
executable file
·112 lines (94 loc) · 4.69 KB
/
genomescan_3groups.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env perl
use strict;
use warnings;
use Classes::Fasta;
use Classes::Regions;
use Classes::Missing;
use Classes::Genotypes;
# written by Alexander Nater, September 2016
# Set your options here:
my $workdir="/home/ubuntu/output/genomescans";
my $fasta_file="/data/data/refgenom/PonAbe_7x_callable.fasta";
my $fasta_index=$fasta_file . ".fai";
my $allsitesfolder="/data/data/vcf/allSites";
my $allsiteslist="$workdir/allsiteslist_3groups.txt";
my $vcffolder="/data/data/vcf/SNPs";
my $vcflist="$workdir/vcflist_3groups.txt";
my $npopulations=5;
my @poplabels=("PP", "ST", "NT", "HS", "PT");
my @nindividuals=(20, 2, 14, 2, 2);
my @minind=(2, 2, 2, 1, 1);
my @popgroups=( [0], [1], [2], [3,4] );
my @grouplabels=("PP", "ST", "NT", "Outgroups");
my $maxsize=1000000000;
my $minlength=100000; # minimum length of chromosome/scaffold to be considered.
my $minmapingquality=20;
my $minvariantquality=0;
my $mingenotypequality=0; # this setting is questionable, as it biases towards lower variability!
my $mincoverage=5;
my $MAFfilter=0;
my $excludeRefN=0; # FALSE=0/TRUE=1 - exclude sites with N in reference sequence.
my $excludenonbiallelic=1; # FALSE=0/TRUE=1 - exclude sites with more than two alleles in population pair.
my $haplotize=0; # FALSE=0/TRUE=1 - only take one random allele per individual.
my $minpropcovered=0.3; # minimum proportion of valid sites per window.
my $outfolder=shift @ARGV;
my $outprefix=shift @ARGV;
my $chromosome=shift @ARGV;
my $subindstring=shift @ARGV;
my $windowsize=shift @ARGV;
my $stepsize=shift @ARGV;
my $verbose=shift @ARGV;
#-----------------------------------------------------------------------------------------------------------------------------
$outfolder=~s{/\z}{}; # remove trailing slash from folder path.
mkdir $outfolder unless (-d "$outfolder");
my $outfile="$outprefix" . "_" . "$chromosome" . ".bed";
# obtain array of individual indices and array of subpop labels:
my ($ref_indnames, $ref_files, $ref_samples)=Misc::getIndnames($allsitesfolder, $allsiteslist);
unless (@$ref_files==@nindividuals){ die "Wrong number of populations in popsizes array!\n" }
my ($ref_subsamples, $ref_subinds, @subpoplabels);
if ($subindstring eq 'all'){ $ref_subsamples=$ref_samples; $ref_subinds=$ref_indnames; @subpoplabels=@poplabels }
else {
($ref_subsamples, $ref_subinds)=Misc::getIndicesfromIndnames($ref_indnames, $ref_samples, [ split(',', $subindstring) ] );
for my $pop (0..$#{ $ref_subsamples }){
push @subpoplabels, $poplabels[$pop] if (@{ $ref_subsamples->[$pop] });
$nindividuals[$pop]=@{ $ref_subsamples->[$pop] };
}
}
print STDERR join(',', @subpoplabels), "\n";
# prepare new arrays for population settings for subset of populations with selected individuals:
my @subpopindices=Misc::setSubArrays(\@subpoplabels, \@poplabels, \@nindividuals, \@minind, \@popgroups, \@grouplabels);
print "Selected population indices: ", join(',', @subpopindices), "\nNumber of individuals per population: ", join(',', @nindividuals), "\nMinimum number of individuals per population: ", join(',', @minind), "\n";
print "$grouplabels[$_]: ", join(',', @{ $popgroups[$_] }), "\n" foreach (0..$#popgroups);
print STDERR join(',', @nindividuals), "\n";
# prepare reference fasta file:
my $fasta=Fasta->new();
$fasta->readIndex($fasta_index, $fasta_file);
my $regions=$fasta->locifromIndex();
my $windows=$regions->windowingLoci($minlength, $windowsize, $stepsize, 0, $chromosome);
# prepare bed files of regions to be excluded:
# my $genes=Regions->new();
# $genes->readBED($genes_file, 'onebased', 0);
# regions from file:
# my $windows=Regions->new();
# $windows->readBED($regions_file, 'onebased', 1);
SUBSET: while (1){
my $subset=$windows->subsetLoci($maxsize);
$subset->printLoci("-");
unless ( $subset->getSize() ){ last SUBSET }
my $concat=$subset->concatRegions();
$concat->addRefseq($fasta) if $excludeRefN;
$concat->printLoci("-");
# acquire missing data:
my $missing=Missing->new($allsitesfolder, $allsiteslist);
$missing->setPops(\@subpopindices, $ref_subsamples);
$missing->readAllsites($concat, $minmapingquality, $minvariantquality, \@nindividuals, $excludeRefN, $verbose);
# acquire genotype data:
my $genotypes=Genotypes->new($vcffolder, $vcflist);
$genotypes->setPops(\@subpopindices, $ref_subsamples);
$genotypes->readvcfList($concat, $mingenotypequality, 0, 0, 0, 0, 0, $verbose);
# calculate and print summary statistics:
# $genotypes->readOutgroups($outgroupfolder, $outgroupvcflist, $concat);
$subset->calculateSS($genotypes, $missing, \@popgroups, $mincoverage, $minpropcovered, \@minind, $haplotize, $MAFfilter, $excludenonbiallelic);
# $windows->storeSFS($concat, $genotypes, $missing, \@popgroups, $mincoverage, \@minind, $haplotize);
$subset->printLociSS("$outfolder/$outfile", 1);
}