Skip to content

Commit

Permalink
Allow rescaling features, i.e. to average TADs
Browse files Browse the repository at this point in the history
  • Loading branch information
Phlya committed Jan 11, 2019
1 parent e06bac8 commit 5c8b8a8
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 67 deletions.
174 changes: 108 additions & 66 deletions coolpup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,15 @@
import os
from natsort import index_natsorted, order_by_index
from scipy import sparse
from mirnylib import numutils

def cornerCV(amap, i=4):
    """Coefficient of variation (std/mean) over the two diagonal corners.

    Takes the i×i top-left and bottom-right corners of the pileup matrix,
    zeroes any non-finite entries, and returns their std divided by mean —
    a flatness measure of the background.
    """
    top_left = amap[:i, :i]
    bottom_right = amap[-i:, -i:]
    corner_vals = np.concatenate((top_left, bottom_right))
    # Replace NaN/inf with 0 before computing the statistics
    corner_vals = np.where(np.isfinite(corner_vals), corner_vals, 0)
    return np.std(corner_vals) / np.mean(corner_vals)

def normCis(amap, i=3):
return amap/np.nanmean((amap[0:i, 0:i]+amap[-i:, -i:]))*2
#def normCis(amap, i=3):
# return amap/np.nanmean((amap[0:i, 0:i]+amap[-i:, -i:]))*2

def get_enrichment(amap, n):
c = int(np.floor(amap.shape[0]/2))
def get_mids(intervals, combinations=True):
    """Return interval midpoints and half-widths ('Pad', in bp).

    Parameters
    ----------
    intervals : pandas.DataFrame
        With combinations=True: columns Chromosome/Start/End.
        With combinations=False: paired columns Chromosome1/Start1/End1
        and Chromosome2/Start2/End2.
    combinations : bool
        Whether the input describes single regions (True) or explicit
        region pairs (False).

    Returns
    -------
    pandas.DataFrame with Mids (rounded centre, int bp) and Pad
    (half of the interval width, bp) per region; duplicates dropped.
    """
    if combinations:
        intervals = intervals.sort_values(['Chromosome', 'Start'])
        mids = np.round((intervals['End'] + intervals['Start']) / 2).astype(int)
        widths = np.round(intervals['End'] - intervals['Start']).astype(int)
        mids = pd.DataFrame({'Chromosome': intervals['Chromosome'],
                             'Mids': mids,
                             'Pad': widths / 2}).drop_duplicates()
    else:
        intervals = intervals.sort_values(['Chromosome1', 'Chromosome2',
                                           'Start1', 'Start2'])
        mids1 = np.round((intervals['End1'] + intervals['Start1']) / 2).astype(int)
        widths1 = np.round(intervals['End1'] - intervals['Start1']).astype(int)
        mids2 = np.round((intervals['End2'] + intervals['Start2']) / 2).astype(int)
        widths2 = np.round(intervals['End2'] - intervals['Start2']).astype(int)
        mids = pd.DataFrame({'Chromosome1': intervals['Chromosome1'],
                             'Mids1': mids1,
                             'Pad1': widths1 / 2,
                             'Chromosome2': intervals['Chromosome2'],
                             'Mids2': mids2,
                             'Pad2': widths2 / 2}).drop_duplicates()
    return mids

def get_combinations(mids, res, local=False, anchor=None):
    """Yield ((start_bin, end_bin), (start_pad, end_pad)) position tuples.

    Parameters
    ----------
    mids : pandas.DataFrame with 'Mids' (bp) and 'Pad' (bp half-width).
    res : int, bin size in bp; positions and pads are converted to bins.
    local : bool, yield the diagonal (i, i) per region instead of all pairs.
    anchor : optional (chrom, start, end) in bp; pair every region with it.

    Raises
    ------
    ValueError if both local and anchor are requested.
    """
    if local and anchor:
        raise ValueError("Can't have a local pileup with an anchor")
    m = (mids['Mids'] // res).values.astype(int)
    p = (mids['Pad'] // res).values.astype(int)
    if local:
        for i, pi in zip(m, p):
            yield (i, i), (pi, pi)
    elif anchor:
        anchor_bin = int((anchor[1] + anchor[2]) / 2 // res)
        # Convert the anchor half-width to whole bins, consistent with p
        # above — a float bp-scale pad would break array slicing downstream.
        anchor_pad = int((anchor[2] - anchor[1]) / 2 // res)
        for i, pi in zip(m, p):
            yield (anchor_bin, i), (anchor_pad, pi)
    else:
        for pair in zip(itertools.combinations(m, 2),
                        itertools.combinations(p, 2)):
            yield pair

def get_positions_pairs(mids, res):
    """Yield ((bin1, bin2), (pad1, pad2)) for explicitly paired intervals.

    Parameters
    ----------
    mids : pandas.DataFrame with 'Mids1'/'Mids2' (bp) and 'Pad1'/'Pad2'
        (bp half-widths) columns, as produced by get_mids(combinations=False).
    res : int, bin size in bp; everything is converted to bin units.
    """
    m1 = (mids['Mids1'] // res).astype(int).values
    m2 = (mids['Mids2'] // res).astype(int).values
    p1 = (mids['Pad1'] // res).astype(int).values
    p2 = (mids['Pad2'] // res).astype(int).values
    # Zip all four arrays element-wise and emit per-pair pads; the previous
    # version unpacked the 4-tuple into two names (ValueError) and yielded
    # the whole pad arrays instead of scalars.
    for a, b, pa, pb in zip(m1, m2, p1, p2):
        yield (a, b), (pa, pb)

def controlLoops(midcombs, res, minshift=10**5, maxshift=10**6, nshifts=1):
def controlRegions(midcombs, res, minshift=10**5, maxshift=10**6, nshifts=1):
minbin = minshift//res
maxbin = maxshift//res
for start, end in midcombs:
Expand All @@ -81,11 +92,13 @@ def controlLoops(midcombs, res, minshift=10**5, maxshift=10**6, nshifts=1):
shift *= sign
yield start+shift, end+shift

def averageLoops(chrom, c, mids, pad=7, ctrl=False, local=False,
minshift=10**5, maxshift=10**6, nshifts=1,
mindist=0, maxdist=10**9, combinations=True, anchor=None,
unbalanced=False, cov_norm=False):

def pileups(chrom, c, mids, pad=7, ctrl=False, local=False,
minshift=10**5, maxshift=10**6, nshifts=1,
mindist=0, maxdist=10**9, combinations=True, anchor=None,
unbalanced=False, cov_norm=False,
rescale=False, rescale_pad=50, size=41):
if local:
data = c.matrix(sparse=True, balance=bool(1-unbalanced)).fetch(chrom).tocsr()
data = sparse.triu(c.matrix(sparse=True, balance=bool(1-unbalanced)).fetch(chrom), 2).tocsr()
if unbalanced and cov_norm:
coverage = np.nan_to_num(np.ravel(np.sum(data, axis=0)))
Expand All @@ -95,11 +108,13 @@ def averageLoops(chrom, c, mids, pad=7, ctrl=False, local=False,
print(anchor)
else:
anchor = None

mymap = np.zeros((2*pad + 1, 2*pad + 1), np.float64)
if rescale:
mymap = np.zeros((size, size), np.float64)
else:
mymap = np.zeros((2*pad + 1, 2*pad + 1), np.float64)
if unbalanced and cov_norm:
cov_start = np.zeros(2*pad+1)
cov_end = np.zeros(2*pad+1)
cov_start = np.zeros(mymap.shape[0])
cov_end = np.zeros(mymap.shape[1])

if combinations:
current = mids[mids["Chromosome"] == chrom]
Expand All @@ -112,31 +127,38 @@ def averageLoops(chrom, c, mids, pad=7, ctrl=False, local=False,

if ctrl:
if combinations:
current = controlLoops(get_combinations(current, c.binsize, local,
current = controlRegions(get_combinations(current, c.binsize, local,
anchor),
c.binsize, minshift, maxshift, nshifts)
else:
current = controlLoops(get_positions_pairs(current, c.binsize),
current = controlRegions(get_positions_pairs(current, c.binsize),
c.binsize, minshift, maxshift, nshifts)
else:
if combinations:
current = get_combinations(current, c.binsize, local, anchor)
else:
current = get_positions_pairs(current, c.binsize)
n = 0
for stBin, endBin in current:
# if not local and abs(endBin - stBin) < pad*2 or :
# continue
for (stBin, endBin), (stPad, endPad) in current:
if rescale:
stPad = stPad + int(round(rescale_pad*2*stPad))
endPad = endPad + int(round(rescale_pad*2*endPad))
else:
stPad = pad
endPad = pad
if mindist <= abs(endBin - stBin)*c.binsize < maxdist or local:
try:
mymap += np.nan_to_num(data[stBin - pad:stBin + pad + 1,
endBin - pad:endBin + pad + 1].toarray())
newmap = np.nan_to_num(data[stBin - stPad:stBin + stPad + 1,
endBin - endPad:endBin + endPad + 1].toarray())
if rescale:
newmap = numutils.zoomArray(newmap, (size, size))
mymap += newmap
n += 1
except (IndexError, ValueError) as e:
continue
if unbalanced and cov_norm:
cov_start += coverage[stBin - pad:stBin + pad + 1]
cov_end += coverage[endBin - pad:endBin + pad + 1]
cov_start += coverage[stBin - stPad:stBin + stPad + 1]
cov_end += coverage[endBin - endPad:endBin + endPad + 1]
print(chrom, n)
if unbalanced and cov_norm:
coverage = np.outer(cov_start, cov_end)
Expand All @@ -148,7 +170,34 @@ def averageLoops(chrom, c, mids, pad=7, ctrl=False, local=False,
else: #Don't want to get infs and nans
return mymap, n

def averageLoopsByWindow(chrom, mids, c, pad=7, ctrl=False,
def pileupsWithControl(mids, filename, pad, nproc, chroms, local,
                       minshift, maxshift, nshifts, mindist, maxdist,
                       combinations, anchor, unbalanced, cov_norm,
                       rescale, rescale_pad, size):
    """Average pileups over chromosomes and normalize by shifted controls.

    Opens `filename` as a cooler and computes per-chromosome pileups in
    parallel with `nproc` workers; chromosome maps are averaged weighted by
    the number of windows actually used on each chromosome. When
    nshifts <= 0 the control step is skipped and the raw weighted-average
    pileup is returned; otherwise the pileup is divided element-wise by the
    control pileup built from randomly shifted positions.
    """
    c = cooler.Cooler(filename)
    p = Pool(nproc)
    try:
        # Pileup of the features themselves
        f = partial(pileups, mids=mids, c=c, pad=pad, ctrl=False, local=local,
                    minshift=minshift, maxshift=maxshift, nshifts=nshifts,
                    mindist=mindist, maxdist=maxdist,
                    combinations=combinations, anchor=anchor,
                    unbalanced=unbalanced, cov_norm=cov_norm,
                    rescale=rescale, rescale_pad=rescale_pad, size=size)
        loops, ns = list(zip(*p.map(f, chroms)))
        # Weights: how many windows were actually used per chromosome
        loop = np.average(loops, axis=0, weights=ns)
        # Controls: same pileup on randomly shifted positions
        if nshifts > 0:
            f = partial(pileups, mids=mids, c=c, pad=pad, ctrl=True,
                        local=local, minshift=minshift, maxshift=maxshift,
                        nshifts=nshifts, mindist=mindist, maxdist=maxdist,
                        combinations=combinations, anchor=anchor,
                        unbalanced=unbalanced, cov_norm=cov_norm,
                        rescale=rescale, rescale_pad=rescale_pad, size=size)
            ctrls, ns = list(zip(*p.map(f, chroms)))
            ctrl = np.average(ctrls, axis=0, weights=ns)
            loop /= ctrl
    finally:
        # Release worker processes even if a pileup raises; the original
        # only closed the pool on the success path, leaking workers.
        p.close()
    return loop

def pileupsByWindow(chrom, mids, c, pad=7, ctrl=False,
minshift=10**5, maxshift=10**6, nshifts=1,
mindist=0, maxdist=10**9, unbalanced=False,
cov_norm=False):
Expand All @@ -167,7 +216,7 @@ def averageLoopsByWindow(chrom, mids, c, pad=7, ctrl=False,
return mymaps
for m in curmids['Mids'].values:
if ctrl:
current = controlLoops(get_combinations(curmids, c.binsize,
current = controlRegions(get_combinations(curmids, c.binsize,
anchor=(chrom, m, m)),
c.binsize, minshift, maxshift, nshifts)
else:
Expand All @@ -178,8 +227,6 @@ def averageLoopsByWindow(chrom, mids, c, pad=7, ctrl=False,
cov_end = np.zeros(2*pad+1)
n = 0
for stBin, endBin in current:
# if abs(endBin - stBin) < pad*2:
# continue
if mindist <= abs(endBin - stBin)*c.binsize < maxdist:
try:
mymap += np.nan_to_num(data[stBin - pad:stBin + pad + 1,
Expand All @@ -203,41 +250,19 @@ def averageLoopsByWindow(chrom, mids, c, pad=7, ctrl=False,
mymaps[m] = mymap
return mymaps

def averageLoopsWithControl(mids, filename, pad, nproc, chroms, local,
minshift, maxshift, nshifts, mindist, maxdist,
combinations, anchor, unbalanced, cov_norm):
p = Pool(nproc)
c = cooler.Cooler(filename)
#Loops
f = partial(averageLoops, mids=mids, c=c, pad=pad, ctrl=False, local=local,
minshift=minshift, maxshift=maxshift, nshifts=nshifts,
mindist=mindist, maxdist=maxdist, combinations=combinations,
anchor=anchor, unbalanced=unbalanced, cov_norm=cov_norm)
loops, ns = list(zip(*p.map(f, chroms)))
loop = np.average(loops, axis=0, weights=ns) #Weights from how many windows we actually used
#Controls
f = partial(averageLoops, mids=mids, c=c, pad=pad, ctrl=True, local=local,
minshift=minshift, maxshift=maxshift, nshifts=nshifts,
mindist=mindist, maxdist=maxdist, combinations=combinations,
anchor=anchor, unbalanced=unbalanced, cov_norm=cov_norm)
ctrls, ns = list(zip(*p.map(f, chroms)))
ctrl = np.average(ctrls, axis=0, weights=ns)
p.close()
return loop/ctrl

def averageLoopsByWindowWithControl(mids, filename, pad, nproc, chroms,
def pileupsByWindowWithControl(mids, filename, pad, nproc, chroms,
minshift, maxshift, nshifts, mindist, maxdist,
unbalanced, cov_norm):
p = Pool(nproc)
c = cooler.Cooler(filename)
#Loops
f = partial(averageLoopsByWindow, c=c, mids=mids, pad=pad, ctrl=False,
f = partial(pileupsByWindow, c=c, mids=mids, pad=pad, ctrl=False,
minshift=minshift, maxshift=maxshift, nshifts=nshifts,
mindist=mindist, maxdist=maxdist, unbalanced=unbalanced,
cov_norm=cov_norm)
loops = {chrom:lps for chrom, lps in zip(fchroms, p.map(f, chroms))}
#Controls
f = partial(averageLoopsByWindow, c=c, mids=mids, pad=pad, ctrl=True,
f = partial(pileupsByWindow, c=c, mids=mids, pad=pad, ctrl=True,
minshift=minshift, maxshift=maxshift, nshifts=nshifts,
mindist=mindist, maxdist=maxdist, unbalanced=unbalanced,
cov_norm=cov_norm)
Expand All @@ -257,8 +282,8 @@ def prepare_single(item):
amap = np.zeros_like(amap)
coords = (key[0], int(key[1]//c.binsize*c.binsize),
int(key[1]//c.binsize*c.binsize + c.binsize))
enr1 = get_enrichment(normCis(amap), 1)
enr3 = get_enrichment(normCis(amap), 3)
enr1 = get_enrichment(amap, 1)
enr3 = get_enrichment(amap, 3)
cv3 = cornerCV(amap, 3)
cv5 = cornerCV(amap, 5)
if args.save_all:
Expand Down Expand Up @@ -322,9 +347,19 @@ def prepare_single(item):
parser.add_argument("--local", action='store_true', default=False,
required=False,
help="Create local pileups, i.e. along the diagonal")
parser.add_argument("--rescale", action='store_true', default=False,
required=False,
help="Do not use pad, and rather use the actual feature\
sizes and rescale pileups to the same shape")
parser.add_argument("--rescale_pad", default=1.0, required=False, type=float,
help="If --rescale, padding in fraction of feature length")
parser.add_argument("--size", type=int,
default=100, required=False,
help="If --rescale, this is used to determine the final\
size of the pileup, i.e. it ill be size×size")
parser.add_argument("--subset", default=0, type=int, required=False,
help="Take a random sample of the bed file - useful for\
files with too many featuers to run as is, i..e some\
files with too many featuers to run as is, i.e. some\
repetitive elements. Set to 0 or lower to keep all data.")
parser.add_argument("--unbalanced", action='store_true',
required=False,
Expand Down Expand Up @@ -377,6 +412,10 @@ def prepare_single(item):
else:
incl_chrs = args.incl_chrs.split(',')

if args.by_window and args.rescale:
raise NotImplementedError('Rescaling with by-window pileups is not\
supported')

if args.anchor is not None:
if '_' in args.anchor:
anchor, anchor_name = args.anchor.split('_')
Expand Down Expand Up @@ -427,7 +466,7 @@ def prepare_single(item):
import warnings
warnings.warn("Always using autonaming for by-window pileups")

finloops = averageLoopsByWindowWithControl(mids=mids,
finloops = pileupsByWindowWithControl(mids=mids,
filename=args.coolfile,
pad=pad,
nproc=nproc,
Expand Down Expand Up @@ -462,7 +501,7 @@ def prepare_single(item):
'Enrichments_%s.tsv' % baseoutname),
sep='\t', index=False)
else:
loop = averageLoopsWithControl(mids=mids, filename=args.coolfile,
loop = pileupsWithControl(mids=mids, filename=args.coolfile,
pad=pad, nproc=nproc,
chroms=fchroms, local=args.local,
minshift=args.minshift,
Expand All @@ -473,7 +512,10 @@ def prepare_single(item):
combinations=combinations,
anchor=anchor,
unbalanced=args.unbalanced,
cov_norm=args.coverage_norm)
cov_norm=args.coverage_norm,
rescale=args.rescale,
rescale_pad=args.rescale_pad,
size=args.size)
if args.outname=='auto':
outname = '%s-%sK_over_%s' % (coolname, c.binsize/1000, bedname)
if anchor:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

# Package metadata: installs coolpup.py as an executable script.
setup(
    name='coolpuppy',
    # 0.5: adds feature rescaling (e.g. for averaging TADs)
    version='0.5',
    scripts=['coolpup.py']
)

0 comments on commit 5c8b8a8

Please sign in to comment.