Skip to content

Commit

Permalink
Merge pull request #19 from apriha/use-build-37-for-processing
Browse files Browse the repository at this point in the history
Use Build 37 for processing; fixes #13
  • Loading branch information
apriha authored Jan 29, 2018
2 parents c652bef + 1536f96 commit 0319d5a
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 142 deletions.
4 changes: 3 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ Remap SNPs
Oops! The data we just loaded is Build 36, but we want Build 37 since the other files in the
datasets are Build 37... Let's remap the SNPs:

>>> user662.remap_snps('NCBI36', 'GRCh37')
>>> user662.assembly
36
>>> user662.remap_snps(37)
Remapping chromosome 1...
Remapping chromosome 2...
Remapping chromosome 3...
Expand Down
36 changes: 18 additions & 18 deletions lineage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def find_discordant_snps(self, individual1, individual2, individual3=None, save_
http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
"""
self._remap_snps_to_GRCh37([individual1, individual2, individual3])

df = individual1.snps

Expand Down Expand Up @@ -192,7 +193,7 @@ def find_discordant_snps(self, individual1, individual2, individual3=None, save_

return df

def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
def find_shared_dna(self, individual1, individual2, cM_threshold=0.75,
snp_threshold=1100, shared_genes=False):
""" Find the shared DNA between two individuals.
Expand All @@ -206,8 +207,6 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
----------
individual1 : Individual
individual2 : Individual
build : {36, 37}
human genome assembly
cM_threshold : float
minimum centiMorgans for each shared DNA segment
snp_threshold : int
Expand All @@ -216,6 +215,8 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
determine shared genes
"""
self._remap_snps_to_GRCh37([individual1, individual2])

df = individual1.snps

df = df.join(individual2.snps['genotype'], rsuffix='2', how='inner')
Expand All @@ -228,7 +229,7 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
one_x_chrom = self._is_one_individual_male(df, genotype1, genotype2)

# determine the genetic distance between each SNP using HapMap tables
hapmap, df = self._compute_snp_distances(df, build)
hapmap, df = self._compute_snp_distances(df)

# determine where individuals share an allele on one chromosome
df['one_chrom_match'] = np.where(
Expand All @@ -254,22 +255,15 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
two_chrom_shared_dna = self._compute_shared_dna(df, hapmap, 'two_chrom_match',
cM_threshold, snp_threshold, one_x_chrom)

if build == 36:
cytobands = self._resources.get_cytoband_h36()
else:
cytobands = self._resources.get_cytoband_h37()
cytobands = self._resources.get_cytoband_h37()

# plot data
if create_dir(self._output_dir):
plot_chromosomes(one_chrom_shared_dna, two_chrom_shared_dna, cytobands,
os.path.join(self._output_dir, 'shared_dna_' +
individual1.get_var_name() + '_' +
individual2.get_var_name() + '.png'),
individual1.name + ' / ' + individual2.name + ' shared DNA', build)

if shared_genes and build == 36:
print('Remap SNPs to Build 37 to determine shared genes')
shared_genes = False
individual1.name + ' / ' + individual2.name + ' shared DNA', 37)

# save results in CSV format
if len(one_chrom_shared_dna) > 0:
Expand Down Expand Up @@ -359,11 +353,8 @@ def _is_one_individual_male(self, df, genotype1, genotype2, heterozygous_x_snp_t
return False


def _compute_snp_distances(self, df, build):
if build == 36:
hapmap = self._resources.get_hapmap_h36()
else:
hapmap = self._resources.get_hapmap_h37()
def _compute_snp_distances(self, df):
hapmap = self._resources.get_hapmap_h37()

for chrom in df['chrom'].unique():
if chrom not in hapmap.keys():
Expand Down Expand Up @@ -497,6 +488,15 @@ def _compute_shared_dna(self, df, hapmap, col, cM_threshold, snp_threshold, one_
counter += 1
return shared_dna

@staticmethod
def _remap_snps_to_GRCh37(individuals):
for i in individuals:
if i is None:
continue

i.remap_snps(37)


def create_dir(path):
""" Create directory specified by `path` if it doesn't already exist.
Expand Down
31 changes: 20 additions & 11 deletions lineage/individual.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,19 +163,20 @@ def get_var_name(self):
# http://stackoverflow.com/a/3305731
return re.sub('\W|^(?=\d)', '_', self.name)

def remap_snps(self, source_assembly, target_assembly, complement_bases=True):
def remap_snps(self, target_assembly, complement_bases=True):
""" Remap the SNP coordinates of this ``Individual`` from one assembly to another.
This method uses the assembly map endpoint of the Ensembl REST API service (via this
``Individual``'s ``EnsemblRestClient``) to convert SNP coordinates / positions from one
assembly to another. After remapping, the coordinates / positions for the ``Individual``'s
SNPs will be that of the target assembly.
If the SNPs are already mapped relative to the target assembly, remapping will not be
performed.
Parameters
----------
source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
starting assembly of an Individual's SNPs
target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
target_assembly : {'NCBI36', 'GRCh37', 'GRCh38', 36, 37, 38}
assembly to remap to
complement_bases : bool
complement bases when remapping SNPs to the minus strand
Expand All @@ -202,16 +203,24 @@ def remap_snps(self, source_assembly, target_assembly, complement_bases=True):
print('Need an ``EnsemblRestClient`` to remap SNPs')
return

valid_assemblies = ['NCBI36', 'GRCh37', 'GRCh38']
valid_assemblies = ['NCBI36', 'GRCh37', 'GRCh38', 36, 37, 38]

if source_assembly not in valid_assemblies:
print('Invalid source assembly')
return
elif target_assembly not in valid_assemblies:
if target_assembly not in valid_assemblies:
print('Invalid target assembly')
return
elif str(self._assembly) not in source_assembly:
print('Current assembly of SNPs does not match specified source assembly')

if isinstance(target_assembly, int):
if target_assembly == 36:
target_assembly = 'NCBI36'
else:
target_assembly = 'GRCh' + str(target_assembly)

if self._assembly == 36:
source_assembly = 'NCBI36'
else:
source_assembly = 'GRCh' + str(self._assembly)

if source_assembly == target_assembly:
return

for chrom in self._snps['chrom'].unique():
Expand Down
115 changes: 4 additions & 111 deletions lineage/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
http://dx.doi.org/10.1038/35057062
..[4] hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
..[5] hg18 (NCBI36): Engineering effort led by Fan Hsu; QA effort led by Ann Zweig,
https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg18
"""

Expand All @@ -46,11 +44,9 @@
"""

import ftplib
import gzip
import os
import tarfile
import tempfile
import urllib.request
import zlib

Expand All @@ -72,27 +68,11 @@ def __init__(self, resources_dir):
"""
self._resources_dir = os.path.abspath(resources_dir)
self._hapmap_h36 = None
self._hapmap_h37 = None
self._cytoband_h36 = None
self._cytoband_h37 = None
self._knownGene_h37 = None
self._kgXref_h37 = None

def get_hapmap_h36(self):
""" Get International HapMap Consortium HapMap for Build 36.
Returns
-------
dict
dict of pandas.DataFrame HapMap tables if loading was successful, else None
"""
if self._hapmap_h36 is None:
self._hapmap_h36 = self._load_hapmap(self._get_path_hapmap_h36())

return self._hapmap_h36

def get_hapmap_h37(self):
""" Get International HapMap Consortium HapMap for Build 37.
Expand All @@ -107,20 +87,6 @@ def get_hapmap_h37(self):

return self._hapmap_h37

def get_cytoband_h36(self):
""" Get UCSC cytoBand table for Build 36.
Returns
-------
pandas.DataFrame
cytoBand table if loading was successful, else None
"""
if self._cytoband_h36 is None:
self._cytoband_h36 = self._load_cytoband(self._get_path_cytoband_h36())

return self._cytoband_h36

def get_cytoband_h37(self):
""" Get UCSC cytoBand table for Build 37.
Expand Down Expand Up @@ -258,20 +224,9 @@ def _load_hapmap(filename):
try:
hapmap = {}

if '36' in filename:
if '37' in filename:
with tarfile.open(filename, 'r') as tar:
# http://stackoverflow.com/a/2018576
for member in tar.getmembers():
if 'genetic_map' in member.name:
df = pd.read_csv(tar.extractfile(member), sep=' ')
df = df.rename(columns={'position': 'pos',
'COMBINED_rate(cM/Mb)': 'rate',
'Genetic_Map(cM)': 'map'})
start_pos = member.name.index('chr') + 3
end_pos = member.name.index('_b36')
hapmap[member.name[start_pos:end_pos]] = df
elif '37' in filename:
with tarfile.open(filename, 'r') as tar:
for member in tar.getmembers():
if 'genetic_map' in member.name:
df = pd.read_csv(tar.extractfile(member), sep='\t')
Expand Down Expand Up @@ -376,20 +331,6 @@ def _load_kgXref(filename):
print(err)
return None

def _get_path_cytoband_h36(self):
""" Get local path to cytoBand file for hg18 / NCBI36 from UCSC, downloading if necessary.
Returns
-------
str
path to cytoband_h36.txt.gz
"""

return self._download_file(
'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/cytoBand.txt.gz',
'cytoband_h36.txt.gz')

def _get_path_cytoband_h37(self):
""" Get local path to cytoBand file for hg19 / GRCh37 from UCSC, downloading if necessary.
Expand All @@ -406,56 +347,6 @@ def _get_path_cytoband_h37(self):
def _get_url_cytoband_h37():
return 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz'

def _get_path_hapmap_h36(self):
""" Get local path to HapMap for hg18 / NCBI36, downloading if necessary.
Returns
-------
str
path to hapmap_h36.tar.gz
References
----------
..[1] "The International HapMap Consortium (2007). A second generation human haplotype
map of over 3.1 million SNPs. Nature 449: 851-861."
"""
if not lineage.create_dir(self._resources_dir):
return None

hapmap = 'hapmap_h36'
destination = os.path.join(self._resources_dir, hapmap + '.tar.gz')

if not os.path.exists(destination):
try:
# make FTP connection to NCBI
with ftplib.FTP('ftp.ncbi.nlm.nih.gov') as ftp:
ftp.login()
ftp.cwd('hapmap/recombination/2008-03_rel22_B36/rates')

# download each HapMap file and add to compressed tar
with tarfile.open(destination, 'w:gz') as out_tar:
for filename in ftp.nlst():
if '.txt' in filename:
path = os.path.join(destination, hapmap, filename)
self._print_download_msg(path)

# open temp file, download HapMap file, close temp file
with tempfile.NamedTemporaryFile(delete=False) as fp:
ftp.retrbinary('RETR ' + filename, fp.write)

# add temp file to archive
out_tar.add(fp.name, arcname=os.path.join(hapmap, filename))

# remove temp file
os.remove(fp.name)
ftp.quit()
except Exception as err:
print(err)
return None

return destination

def _get_path_hapmap_h37(self):
""" Get local path to HapMap for hg19 / GRCh37, downloading if necessary.
Expand All @@ -466,7 +357,9 @@ def _get_path_hapmap_h37(self):
References
----------
..[1] "The map was generated by lifting the HapMap Phase II genetic map from build 35 to
..[1] "The International HapMap Consortium (2007). A second generation human haplotype
map of over 3.1 million SNPs. Nature 449: 851-861."
..[2] "The map was generated by lifting the HapMap Phase II genetic map from build 35 to
GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using
the UCSC liftOver tool. Adam Auton, 08/12/2010"
Expand Down
2 changes: 1 addition & 1 deletion lineage/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def plot_chromosomes(one_chrom_match, two_chrom_match, cytobands, path, title, b
path to destination `.png` file
title : str
title for plot
build : {36, 37}
build : {37}
human genome assembly
"""
Expand Down

0 comments on commit 0319d5a

Please sign in to comment.