Skip to content

Commit

Permalink
update all-genes and per-gene mode to use the Python's version to fin…
Browse files Browse the repository at this point in the history
…d thresholds
  • Loading branch information
Mai To Uyen authored and Mai To Uyen committed Jan 10, 2021
1 parent fc2c79c commit c04e29e
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 42 deletions.
56 changes: 14 additions & 42 deletions run_treeshrink.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,7 @@
from treeshrink import set_tmp_dir, get_tmp_dir, get_tmp_file
from treeshrink.util_lib import minVar_bisect
import re
from scipy.stats.mstats import mquantiles
import statsmodels.api as sm
from scipy.stats import gaussian_kde


def find_threshold_lmquantile(data,q):
x = [log(y) for y in data]
t = mquantiles(x,prob=q)
return exp(t)

def find_threshold_lkernel(data,q):
x = sorted([log(y) for y in data])
kernel = gaussian_kde(x,'silverman')

# compute cdf
cdf = [kernel.integrate_box_1d(-float("inf"),x[0])]
for i in range(len(x)-1):
cdf.append(cdf[-1]+kernel.integrate_box_1d(x[i],x[i+1]))
cdf.append(cdf[-1]+kernel.integrate_box_1d(x[-1],float("inf")))

# normalize cdf
s = cdf[-1]
cdf = [y/s for y in cdf]

# find threshold
for i,c in enumerate(cdf):
if c > q:
break

if c > q:
t = exp(x[i-1]) if i >0 else exp(x[0])-1e-4
else:
t = exp(x[-1])+1e-4

return t
from treeshrink.threshold_lib import find_threshold_lkernel,find_threshold_loglnorm

def make_dir(dirName):
if exists(dirName) and isdir(dirName):
Expand Down Expand Up @@ -211,9 +177,10 @@ def main():
for s in mapping:
f.write(str(mapping[s]) + " " +s)
f.write("\n")
thresholds = find_threshold_loglnorm(mapping.values(),[1-float(q) for q in quantiles])
if len(mapping) > 1:
for i,q in enumerate(quantiles):
threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_loglnorm.R")),filename,q]).lstrip().rstrip()[4:])
threshold = thresholds[i]
for s in mapping:
if mapping[s] > threshold:
removing_sets[i][t].append(s)
Expand Down Expand Up @@ -253,11 +220,10 @@ def main():
for v in species_map[s]:
f.write(str(v))
f.write("\n")
thresholds = [ 0 for i in range(len(quantiles)) ]
#thresholds = [ 0 for i in range(len(quantiles)) ]
thresholds = [max(minImpact,thres) for thres in find_threshold_lkernel(species_map[s],[1-float(q) for q in quantiles])]
for i,q in enumerate(quantiles):
thresholds[i] = max(minImpact,find_threshold_lkernel(species_map[s],1.0-float(q)))
#thresholds[i] = max(minImpact,float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:]))
#print(t,thresholds[i])
#thresholds[i] = max(minImpact,find_threshold_lkernel(species_map[s],1.0-float(q)))
if s not in exceptions:
print("%s:\n\t will be cut in %d trees where its impact is above %f for quantile %s" %(s,sum(1 for x in species_map[s] if x>thresholds[i]),thresholds[i],q,))
species_map[s] = (species_map[s],thresholds)
Expand All @@ -277,14 +243,20 @@ def main():
for s,r in gene:
f.write(str(r))
f.write("\n")
v = []
for gene in gene_list:
for s,r in gene:
v.append(r)

thresholds = find_threshold_lkernel(v,[1-float(q) for q in quantiles])

for i,q in enumerate(quantiles):
threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:])
threshold = thresholds[i]
for t,gene in enumerate(gene_list):
for x,r in gene:
#s = g2sp[x] if x in g2sp else x
if r > threshold:
removing_sets[i][t].append(x)

print("Writing output ...\n")

fName,ext = splitext(basename(args["tree"]))
Expand Down
44 changes: 44 additions & 0 deletions treeshrink/threshold_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from scipy.stats import gaussian_kde,lognorm
from math import log, exp
from statistics import mean,stdev

EPS = 1e-5

def find_threshold_loglnorm(data,quantiles):
x = [log(log(y)) for y in data]
mu = mean(x)
sigma = stdev(x)
return [exp(t) for t in lognorm.ppf(quantiles, sigma, scale=exp(mu))]

def find_threshold_lkernel(data,quantiles):
x = sorted([log(y) for y in data])
kernel = gaussian_kde(x,'silverman')

# compute cdf
cdf = [kernel.integrate_box_1d(-float("inf"),x[0])]
for i in range(len(x)-1):
cdf.append(cdf[-1]+kernel.integrate_box_1d(x[i],x[i+1]))
cdf.append(cdf[-1]+kernel.integrate_box_1d(x[-1],float("inf")))

thresholds = [0]*len(quantiles)
for i,q in enumerate(quantiles):
cutoff_idx = __find_cutoff_idx__(cdf,q)
if cutoff_idx < 0:
t = exp(x[0]) - EPS
else:
t = exp(x[cutoff_idx]) + EPS
thresholds[i] = t
return thresholds


def __find_cutoff_idx__(cdf,q):
# normalize cdf
s = cdf[-1]
cdf = [y/s for y in cdf]

# find the cutoff
for i,c in enumerate(cdf):
if c > q:
return i-1
# only get here if q == 1.0
return i-1

0 comments on commit c04e29e

Please sign in to comment.