-
Notifications
You must be signed in to change notification settings - Fork 0
/
PlotPSMC.py
197 lines (159 loc) · 7.53 KB
/
PlotPSMC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import matplotlib.pyplot as pplot
import matplotlib.ticker as mtick
import re as regex
from random import random as rnd
import numpy
import os
class OOMFormatter(mtick.ScalarFormatter):
# https://stackoverflow.com/questions/42656139/set-scientific-notation-with-fixed-exponent-and-significant-digits-for-multiple
def __init__(self, order=0, fformat="%1.1f", offset=True, mathText=True):
self.oom = order
self.fformat = fformat
mtick.ScalarFormatter.__init__(self, useOffset=offset, useMathText=mathText)
def _set_orderOfMagnitude(self, nothing):
self.orderOfMagnitude = self.oom
def _set_format(self, vmin, vmax):
self.format = self.fformat
if self._useMathText:
self.format = '$%s$' % mtick._mathdefault(self.format)
def parse_psmc_output(psmcInputList, representAsEffectiveSize):
# TODO: apparently there is some sort of bug because my plots, when compared to Li's psmc_plot.pl plots
# do not exactly match when the same data is used, especially when it comes to the population size.
#
allPsmcData = []
for psmcFiles in psmcInputList:
lastIteration = ""
with open(psmcFiles[0], 'r') as psmcFile:
inBlock = False
timePoints, lambdaPoints, blockPoints = [], [], []
estimatedTheta = n0 = 0
for line in psmcFile:
if line.split()[0] == "MM" and line.split()[1].split(":")[0] == "n_iterations":
# We could also iterate through the whole file and get the maximum RD value
lastIteration = line.split()[1].split(":")[1].strip(",")
if line.split() == ['RD', lastIteration]: # Last iteration
inBlock = True
timePoints, lambdaPoints = [], []
if inBlock and line[:2] == "RS":
timePoints.append(float(line.split('\t')[2]))
lambdaPoints.append(float(line.split('\t')[3]))
elif inBlock and line[:2] == "PA":
inBlock = False
estimatedTheta = float(line.split()[2])
generationTime = psmcFiles[1]
mutRate = psmcFiles[2]
binSize = psmcFiles[3]
n0 = estimatedTheta/(4*mutRate)/binSize
scaledSize = scaledTime = 0
if representAsEffectiveSize:
scaledTime = [generationTime * 2 * n0 * time_k for time_k in timePoints]
scaledSize = [n0 * lambda_k for lambda_k in lambdaPoints]
else:
scaledTime = [time_k * estimatedTheta / binSize for time_k in timePoints]
# pairwiseSequenceDivergence
scaledSize = [(lambda_k * estimatedTheta / binSize)*1e3 for lambda_k in lambdaPoints]
# scaledMutRate
blockPoints.append((scaledTime, scaledSize))
allPsmcData.append(blockPoints)
return allPsmcData
def plotPsmc(listOfOpt, yAsEffectiveSize,
xmin=0, xmax=0,
ymin=0, ymax=0,
transparency=0.1, isXLogScale=True, isYLogScale=False, showLGM=False,
savePlotWithName="myPlot"):
myFigure = pplot.figure(1)
inFigure = myFigure.add_subplot(111)
# Show Last Glacial Maximum?
if showLGM:
inFigure.axvline(
linewidth=10,
alpha=0.25,
label=None, # label="LGM",
x=22000,
color='black')
myData = parse_psmc_output(listOfOpt, representAsEffectiveSize=yAsEffectiveSize)
for i_sample in range(0, len(myData)):
# bootstraped psmc
for j_bootStrap in range(0, len(myData[i_sample])):
inFigure.step(myData[i_sample][j_bootStrap][0],
myData[i_sample][j_bootStrap][1],
color=listOfOpt[i_sample][5],
linewidth=1.0,
alpha=transparency)
# original psmc
inFigure.step(myData[i_sample][0][0],
myData[i_sample][0][1],
color=listOfOpt[i_sample][5],
label=listOfOpt[i_sample][4])
inFigure.legend(loc=0)
myFigure.suptitle("PSMC estimate on real data")
sumAxes = xmin+xmax+ymin+ymax
if isXLogScale:
inFigure.set_xscale("log")
else:
inFigure.set_xscale("linear")
xOoMagnitude = numpy.floor(numpy.log10(numpy.abs(xmax)))
inFigure.xaxis.set_major_formatter(OOMFormatter(xOoMagnitude, "%1.1f"))
if isYLogScale:
inFigure.set_yscale("log")
else:
inFigure.set_yscale("linear")
if yAsEffectiveSize:
pplot.xlabel("Years")
pplot.ylabel("Effective population size")
pplot.title("Y axis scaled as $N_e$")
if sumAxes == 0:
xmin = 1e3; xmax = 1e7; ymin = 0; ymax = 5e4
if ymax <= 1e4:
inFigure.yaxis.set_major_formatter(OOMFormatter(4, "%1.2f"))
elif ymax <= 1e5:
inFigure.yaxis.set_major_formatter(OOMFormatter(4, "%1.1f"))
elif ymax <= 1e7:
inFigure.yaxis.set_major_formatter(OOMFormatter(4, "%1.0f"))
elif ymax > 1e7:
yOoMagnitude = numpy.floor(numpy.log10(numpy.abs(ymax)))
inFigure.yaxis.set_major_formatter(OOMFormatter(yOoMagnitude, "%1.2f"))
else:
pplot.xlabel(r'Time (scaled in units of 2$\mu$T)')
pplot.ylabel("Population size\n(scaled in units of $4\mu N_e\ x\ 10^3$)")
pplot.title("Y axis scaled as $4\mu N_e\ x\ 10^3$")
if sumAxes == 0:
xmin = 1e-6; xmax = 1e-2; ymin = 0; ymax = 5e0
inFigure.yaxis.set_major_formatter(OOMFormatter(0, "%1.0f"))
inFigure.grid(True)
inFigure.set_xlim(xmin, xmax)
inFigure.set_ylim(ymin, ymax)
if not os.path.exists("./Plots"):
os.mkdir("./Plots/")
myFigure.savefig("./Plots/"+savePlotWithName)
myFigure.clf() # close/clear fig so that it doesnt keep using resources
# pplot.close(1) # can't actually close figure because of current conflict with tkinter GUI
def readPsmcOptions(pathToOptionsFile):
psmcOptions = []
with open(pathToOptionsFile, 'r') as psmcOptionsFile:
for line in psmcOptionsFile:
if line[:1] != '#': # if line doesnt start with a '#'
# create a list of options from parameter file
optTokens = regex.findall(r'[^\s\t\"]+', line)
# could do it all in one line but this has better readability
pathToPsmcFile = optTokens[0]
generationTime = float(optTokens[1])
mutationRate = float(optTokens[2])
binSize = float(optTokens[3])
sampleName = optTokens[4]
if len(optTokens) == 6:
lineColor = optTokens[5]
else:
# get random color
lineColor = (rnd(), rnd(), rnd())
psmcOptions.append((pathToPsmcFile,
generationTime,
mutationRate,
binSize,
sampleName,
lineColor))
return psmcOptions
# a list of inputs/options to plot the PSMC curve
# psmc_options = readPsmcOptions("./plotPSMC.csv")
# a = plotPsmc(psmc_options, yAsEffectiveSize=True, xmin=1e4, xmax=1e8, ymin=0, ymax=2e5, transparency=0.15)
# b = plotPsmc(psmc_options, yAsEffectiveSize=False, xmin=1e-7, xmax=1e-2, ymin=0, ymax=5e0)