Merge pull request #19 from apriha/use-build-37-for-processing

Use Build 37 for processing; fixes #13
apriha · Jan 29, 2018 · 0319d5a · 0319d5a
2 parents c652bef + 1536f96
commit 0319d5a
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 142 deletions.
diff --git a/README.rst b/README.rst
@@ -76,7 +76,9 @@ Remap SNPs
 Oops! The data we just loaded is Build 36, but we want Build 37 since the other files in the
 datasets are Build 37... Let's remap the SNPs:
 
->>> user662.remap_snps('NCBI36', 'GRCh37')
+>>> user662.assembly
+36
+>>> user662.remap_snps(37)
 Remapping chromosome 1...
 Remapping chromosome 2...
 Remapping chromosome 3...

diff --git a/lineage/__init__.py b/lineage/__init__.py
@@ -118,6 +118,7 @@ def find_discordant_snps(self, individual1, individual2, individual3=None, save_
           http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
 
         """
+        self._remap_snps_to_GRCh37([individual1, individual2, individual3])
 
         df = individual1.snps
 
@@ -192,7 +193,7 @@ def find_discordant_snps(self, individual1, individual2, individual3=None, save_
 
         return df
 
-    def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
+    def find_shared_dna(self, individual1, individual2, cM_threshold=0.75,
                         snp_threshold=1100, shared_genes=False):
         """ Find the shared DNA between two individuals.
 
@@ -206,8 +207,6 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
         ----------
         individual1 : Individual
         individual2 : Individual
-        build : {36, 37}
-            human genome assembly
         cM_threshold : float
             minimum centiMorgans for each shared DNA segment
         snp_threshold : int
@@ -216,6 +215,8 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
             determine shared genes
 
         """
+        self._remap_snps_to_GRCh37([individual1, individual2])
+
         df = individual1.snps
 
         df = df.join(individual2.snps['genotype'], rsuffix='2', how='inner')
@@ -228,7 +229,7 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
         one_x_chrom = self._is_one_individual_male(df, genotype1, genotype2)
 
         # determine the genetic distance between each SNP using HapMap tables
-        hapmap, df = self._compute_snp_distances(df, build)
+        hapmap, df = self._compute_snp_distances(df)
 
         # determine where individuals share an allele on one chromosome
         df['one_chrom_match'] = np.where(
@@ -254,22 +255,15 @@ def find_shared_dna(self, individual1, individual2, build=37, cM_threshold=0.75,
         two_chrom_shared_dna = self._compute_shared_dna(df, hapmap, 'two_chrom_match',
                                                         cM_threshold, snp_threshold, one_x_chrom)
 
-        if build == 36:
-            cytobands = self._resources.get_cytoband_h36()
-        else:
-            cytobands = self._resources.get_cytoband_h37()
+        cytobands = self._resources.get_cytoband_h37()
 
         # plot data
         if create_dir(self._output_dir):
             plot_chromosomes(one_chrom_shared_dna, two_chrom_shared_dna, cytobands,
                              os.path.join(self._output_dir, 'shared_dna_' +
                                           individual1.get_var_name() + '_' +
                                           individual2.get_var_name() + '.png'),
-                             individual1.name + ' / ' + individual2.name + ' shared DNA', build)
-
-        if shared_genes and build == 36:
-            print('Remap SNPs to Build 37 to determine shared genes')
-            shared_genes = False
+                             individual1.name + ' / ' + individual2.name + ' shared DNA', 37)
 
         # save results in CSV format
         if len(one_chrom_shared_dna) > 0:
@@ -359,11 +353,8 @@ def _is_one_individual_male(self, df, genotype1, genotype2, heterozygous_x_snp_t
                 return False
 
 
-    def _compute_snp_distances(self, df, build):
-        if build == 36:
-            hapmap = self._resources.get_hapmap_h36()
-        else:
-            hapmap = self._resources.get_hapmap_h37()
+    def _compute_snp_distances(self, df):
+        hapmap = self._resources.get_hapmap_h37()
 
         for chrom in df['chrom'].unique():
             if chrom not in hapmap.keys():
@@ -497,6 +488,15 @@ def _compute_shared_dna(self, df, hapmap, col, cM_threshold, snp_threshold, one_
                 counter += 1
         return shared_dna
 
+    @staticmethod
+    def _remap_snps_to_GRCh37(individuals):
+        for i in individuals:
+            if i is None:
+                continue
+
+            i.remap_snps(37)
+
+
 def create_dir(path):
     """ Create directory specified by `path` if it doesn't already exist.
 

diff --git a/lineage/individual.py b/lineage/individual.py
@@ -163,19 +163,20 @@ def get_var_name(self):
         # http://stackoverflow.com/a/3305731
         return re.sub('\W|^(?=\d)', '_', self.name)
 
-    def remap_snps(self, source_assembly, target_assembly, complement_bases=True):
+    def remap_snps(self, target_assembly, complement_bases=True):
         """ Remap the SNP coordinates of this ``Individual`` from one assembly to another.
 
         This method uses the assembly map endpoint of the Ensembl REST API service (via this
         ``Individual``'s ``EnsemblRestClient``) to convert SNP coordinates / positions from one
         assembly to another. After remapping, the coordinates / positions for the ``Individual``'s
         SNPs will be that of the target assembly.
 
+        If the SNPs are already mapped relative to the target assembly, remapping will not be
+        performed.
+
         Parameters
         ----------
-        source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
-            starting assembly of an Individual's SNPs
-        target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
+        target_assembly : {'NCBI36', 'GRCh37', 'GRCh38', 36, 37, 38}
             assembly to remap to
         complement_bases : bool
             complement bases when remapping SNPs to the minus strand
@@ -202,16 +203,24 @@ def remap_snps(self, source_assembly, target_assembly, complement_bases=True):
             print('Need an ``EnsemblRestClient`` to remap SNPs')
             return
 
-        valid_assemblies = ['NCBI36', 'GRCh37', 'GRCh38']
+        valid_assemblies = ['NCBI36', 'GRCh37', 'GRCh38', 36, 37, 38]
 
-        if source_assembly not in valid_assemblies:
-            print('Invalid source assembly')
-            return
-        elif target_assembly not in valid_assemblies:
+        if target_assembly not in valid_assemblies:
             print('Invalid target assembly')
             return
-        elif str(self._assembly) not in source_assembly:
-            print('Current assembly of SNPs does not match specified source assembly')
+
+        if isinstance(target_assembly, int):
+            if target_assembly == 36:
+                target_assembly = 'NCBI36'
+            else:
+                target_assembly = 'GRCh' + str(target_assembly)
+
+        if self._assembly == 36:
+            source_assembly = 'NCBI36'
+        else:
+            source_assembly = 'GRCh' + str(self._assembly)
+
+        if source_assembly == target_assembly:
             return
 
         for chrom in self._snps['chrom'].unique():

diff --git a/lineage/resources.py b/lineage/resources.py
@@ -23,8 +23,6 @@
   http://dx.doi.org/10.1038/35057062
 ..[4] hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
   Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
-..[5] hg18 (NCBI36): Engineering effort led by Fan Hsu; QA effort led by Ann Zweig,
-  https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg18
 
 """
 
@@ -46,11 +44,9 @@
 
 """
 
-import ftplib
 import gzip
 import os
 import tarfile
-import tempfile
 import urllib.request
 import zlib
 
@@ -72,27 +68,11 @@ def __init__(self, resources_dir):
 
         """
         self._resources_dir = os.path.abspath(resources_dir)
-        self._hapmap_h36 = None
         self._hapmap_h37 = None
-        self._cytoband_h36 = None
         self._cytoband_h37 = None
         self._knownGene_h37 = None
         self._kgXref_h37 = None
 
-    def get_hapmap_h36(self):
-        """ Get International HapMap Consortium HapMap for Build 36.
-
-        Returns
-        -------
-        dict
-            dict of pandas.DataFrame HapMap tables if loading was successful, else None
-
-        """
-        if self._hapmap_h36 is None:
-            self._hapmap_h36 = self._load_hapmap(self._get_path_hapmap_h36())
-
-        return self._hapmap_h36
-
     def get_hapmap_h37(self):
         """ Get International HapMap Consortium HapMap for Build 37.
 
@@ -107,20 +87,6 @@ def get_hapmap_h37(self):
 
         return self._hapmap_h37
 
-    def get_cytoband_h36(self):
-        """ Get UCSC cytoBand table for Build 36.
-
-        Returns
-        -------
-        pandas.DataFrame
-            cytoBand table if loading was successful, else None
-
-        """
-        if self._cytoband_h36 is None:
-            self._cytoband_h36 = self._load_cytoband(self._get_path_cytoband_h36())
-
-        return self._cytoband_h36
-
     def get_cytoband_h37(self):
         """ Get UCSC cytoBand table for Build 37.
 
@@ -258,20 +224,9 @@ def _load_hapmap(filename):
         try:
             hapmap = {}
 
-            if '36' in filename:
+            if '37' in filename:
                 with tarfile.open(filename, 'r') as tar:
                     # http://stackoverflow.com/a/2018576
-                    for member in tar.getmembers():
-                        if 'genetic_map' in member.name:
-                            df = pd.read_csv(tar.extractfile(member), sep=' ')
-                            df = df.rename(columns={'position': 'pos',
-                                                    'COMBINED_rate(cM/Mb)': 'rate',
-                                                    'Genetic_Map(cM)': 'map'})
-                            start_pos = member.name.index('chr') + 3
-                            end_pos = member.name.index('_b36')
-                            hapmap[member.name[start_pos:end_pos]] = df
-            elif '37' in filename:
-                with tarfile.open(filename, 'r') as tar:
                     for member in tar.getmembers():
                         if 'genetic_map' in member.name:
                             df = pd.read_csv(tar.extractfile(member), sep='\t')
@@ -376,20 +331,6 @@ def _load_kgXref(filename):
             print(err)
             return None
 
-    def _get_path_cytoband_h36(self):
-        """ Get local path to cytoBand file for hg18 / NCBI36 from UCSC, downloading if necessary.
-
-        Returns
-        -------
-        str
-            path to cytoband_h36.txt.gz
-
-        """
-
-        return self._download_file(
-            'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/cytoBand.txt.gz',
-            'cytoband_h36.txt.gz')
-
     def _get_path_cytoband_h37(self):
         """ Get local path to cytoBand file for hg19 / GRCh37 from UCSC, downloading if necessary.
 
@@ -406,56 +347,6 @@ def _get_path_cytoband_h37(self):
     def _get_url_cytoband_h37():
         return 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz'
 
-    def _get_path_hapmap_h36(self):
-        """ Get local path to HapMap for hg18 / NCBI36, downloading if necessary.
-
-        Returns
-        -------
-        str
-            path to hapmap_h36.tar.gz
-
-        References
-        ----------
-        ..[1] "The International HapMap Consortium (2007).  A second generation human haplotype
-          map of over 3.1 million SNPs.  Nature 449: 851-861."
-
-        """
-        if not lineage.create_dir(self._resources_dir):
-            return None
-
-        hapmap = 'hapmap_h36'
-        destination = os.path.join(self._resources_dir, hapmap + '.tar.gz')
-
-        if not os.path.exists(destination):
-            try:
-                # make FTP connection to NCBI
-                with ftplib.FTP('ftp.ncbi.nlm.nih.gov') as ftp:
-                    ftp.login()
-                    ftp.cwd('hapmap/recombination/2008-03_rel22_B36/rates')
-
-                    # download each HapMap file and add to compressed tar
-                    with tarfile.open(destination, 'w:gz') as out_tar:
-                        for filename in ftp.nlst():
-                            if '.txt' in filename:
-                                path = os.path.join(destination, hapmap, filename)
-                                self._print_download_msg(path)
-
-                                # open temp file, download HapMap file, close temp file
-                                with tempfile.NamedTemporaryFile(delete=False) as fp:
-                                    ftp.retrbinary('RETR ' + filename, fp.write)
-
-                                # add temp file to archive
-                                out_tar.add(fp.name, arcname=os.path.join(hapmap, filename))
-
-                                # remove temp file
-                                os.remove(fp.name)
-                    ftp.quit()
-            except Exception as err:
-                print(err)
-                return None
-
-        return destination
-
     def _get_path_hapmap_h37(self):
         """ Get local path to HapMap for hg19 / GRCh37, downloading if necessary.
 
@@ -466,7 +357,9 @@ def _get_path_hapmap_h37(self):
 
         References
         ----------
-        ..[1] "The map was generated by lifting the HapMap Phase II genetic map from build 35 to
+        ..[1] "The International HapMap Consortium (2007).  A second generation human haplotype
+          map of over 3.1 million SNPs.  Nature 449: 851-861."
+        ..[2] "The map was generated by lifting the HapMap Phase II genetic map from build 35 to
           GRCh37. The original map was generated using LDhat as described in the 2007 HapMap
           paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using
           the UCSC liftOver tool. Adam Auton, 08/12/2010"

diff --git a/lineage/visualization.py b/lineage/visualization.py
@@ -77,7 +77,7 @@ def plot_chromosomes(one_chrom_match, two_chrom_match, cytobands, path, title, b
         path to destination `.png` file
     title : str
         title for plot
-    build : {36, 37}
+    build : {37}
         human genome assembly
 
     """