From c600a4850a16d91c76484b20644b2f10edf69053 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Wed, 14 Apr 2021 20:32:08 -0700 Subject: [PATCH 1/2] Update documentation --- src/lineage/resources.py | 27 +++++++++++++-------------- src/lineage/visualization.py | 4 ++-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/lineage/resources.py b/src/lineage/resources.py index c624120..60fb7ef 100644 --- a/src/lineage/resources.py +++ b/src/lineage/resources.py @@ -133,6 +133,7 @@ def get_genetic_map(self, genetic_map): def get_genetic_map_HapMapII_GRCh37(self): """ Get International HapMap Consortium HapMap Phase II genetic map for Build 37. + [#InternationalHapMapConsortium]_ [#Auton2010]_ Returns ------- @@ -141,10 +142,10 @@ def get_genetic_map_HapMapII_GRCh37(self): References ---------- - 1. "The International HapMap Consortium (2007). A second generation human haplotype - map of over 3.1 million SNPs. Nature 449: 851-861." - 2. "The map was generated by lifting the HapMap Phase II genetic map from build 35 to - GRCh37. The original map was generated using LDhat as described in the 2007 HapMap + .. [#InternationalHapMapConsortium] "The International HapMap Consortium (2007). A second generation human + haplotype map of over 3.1 million SNPs. Nature 449: 851-861." + .. [#Auton2010] "The map was generated by lifting the HapMap Phase II genetic map from build + 35 to GRCh37. The original map was generated using LDhat as described in the 2007 HapMap paper (Nature, 18th Sept 2007). The conversion from b35 to GRCh37 was achieved using the UCSC liftOver tool. Adam Auton, 08/12/2010" """ @@ -157,11 +158,12 @@ def get_genetic_map_HapMapII_GRCh37(self): return self._genetic_map def get_genetic_map_1000G_GRCh37(self, pop): - """ Get population-specific 1000 Genomes Project genetic map. + """ Get population-specific 1000 Genomes Project genetic map. [#Auton2013]_ [#Auton2015]_ Notes ----- - From `README_omni_recombination_20130507 `_: + From `README_omni_recombination_20130507 + `_ [#Auton2013]_ : Genetic maps generated from the 1000G phased OMNI data. @@ -185,9 +187,6 @@ def get_genetic_map_1000G_GRCh37(self, pop): estimate of 4Ne, allowing the population based rates to be converted to per-generation rates. - Adam Auton (adam.auton@einstein.yu.edu) - May 7th, 2013 - Returns ------- dict @@ -196,11 +195,11 @@ def get_genetic_map_1000G_GRCh37(self, pop): References ---------- - 1. Adam Auton (adam.auton@einstein.yu.edu), May 7th, 2013 - 2. The 1000 Genomes Project Consortium., Corresponding authors., Auton, A. et al. A - global reference for human genetic variation. Nature 526, 68–74 (2015). + .. [#Auton2013] Adam Auton, May 7th, 2013 + .. [#Auton2015] The 1000 Genomes Project Consortium., Corresponding authors., + Auton, A. et al. A global reference for human genetic variation. Nature 526, 68–74 (2015). https://doi.org/10.1038/nature15393 - 3. https://github.com/joepickrell/1000-genomes-genetic-maps + .. [#Pickrell] https://github.com/joepickrell/1000-genomes-genetic-maps """ if self._genetic_map_name != pop: self._genetic_map = self._load_genetic_map_1000G_GRCh37( @@ -415,7 +414,7 @@ def _load_cytoBand(filename): df : pandas.DataFrame cytoBand table """ - # adapted from chromosome plotting code (see [1]_) + # adapted from chromosome plotting code (see get_cytoBand_hg19 Ref. 1.) df = pd.read_csv( filename, names=["chrom", "start", "end", "name", "gie_stain"], sep="\t" ) diff --git a/src/lineage/visualization.py b/src/lineage/visualization.py index 1b43758..a62d118 100644 --- a/src/lineage/visualization.py +++ b/src/lineage/visualization.py @@ -2,11 +2,11 @@ Notes ----- -Adapted from Ryan Dale's GitHub Gist for plotting chromosome features. [1]_ +Adapted from Ryan Dale's GitHub Gist for plotting chromosome features. [#Dale]_ References ---------- -.. [1] Ryan Dale, GitHub Gist, +.. [#Dale] Ryan Dale, GitHub Gist, https://gist.github.com/daler/c98fc410282d7570efc3#file-ideograms-py """ From 67edd8eca7f93d543c9900315cf73d00c4a407e0 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Wed, 14 Apr 2021 23:25:28 -0700 Subject: [PATCH 2/2] Document algorithms --- README.rst | 12 +++---- src/lineage/__init__.py | 72 +++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/README.rst b/README.rst index 1b791a6..aef7fce 100644 --- a/README.rst +++ b/README.rst @@ -117,8 +117,8 @@ Loading SNPs('resources/663.23andme.305.txt.gz') Now we can perform some analysis between the ``User662`` and ``User663`` datasets. -Find Discordant SNPs -'''''''''''''''''''' +`Find Discordant SNPs `_ +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' First, let's find discordant SNPs (i.e., SNP data that is not consistent with Mendelian inheritance): @@ -136,8 +136,8 @@ the prompt, although the same output is available in the CSV file. Not counting mtDNA SNPs, there are 37 discordant SNPs between these two datasets. -Find Shared DNA -''''''''''''''' +`Find Shared DNA `_ +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' ``lineage`` uses the probabilistic recombination rates throughout the human genome from the `International HapMap Project `_ and the `1000 Genomes Project `_ to compute the shared DNA @@ -180,8 +180,8 @@ details the shared segments of DNA on one chromosome and a plot that illustrates .. image:: https://raw.githubusercontent.com/apriha/lineage/master/docs/images/shared_dna_User662_User663_HapMap2.png -Find Shared Genes -''''''''''''''''' +`Find Shared Genes `_ +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' The `Central Dogma of Molecular Biology `_ states that genetic information flows from DNA to mRNA to proteins: DNA is transcribed into mRNA, and mRNA is translated into a protein. It's more complicated than this (it's biology diff --git a/src/lineage/__init__.py b/src/lineage/__init__.py index 9ca6d78..cca4d8f 100644 --- a/src/lineage/__init__.py +++ b/src/lineage/__init__.py @@ -290,6 +290,36 @@ def find_shared_dna( All output is saved to the output directory as `CSV` or `PNG` files. + Notes + ----- + The code is commented throughout to help describe the algorithm and its operation. + + To summarize, the algorithm first computes the genetic distance in cMs between SNPs + common to all individuals using the specified genetic map. + + Then, individuals are compared for whether they share one or two alleles for each SNP in + common; in this manner, where all individuals share one chromosome, for example, there + will be several SNPs in a row where at least one allele is shared between individuals for + each SNP. The ``cM_threshold`` is then applied to each of these "matching segments" to + determine whether the segment could be a potential shared DNA segment (i.e., whether each + segment has a cM value greater than the threshold). + + The matching segments that passed the ``cM_threshold`` are then checked to see if they + are adjacent to another matching segment, and if so, the segments are stitched together, + and the single SNP separating the segments is flagged as potentially discrepant. (This + means that multiple smaller matching segments passing the ``cM_threshold`` could be + stitched, identifying the SNP between each segment as discrepant.) + + Next, the ``snp_threshold`` is applied to each segment to ensure there are enough SNPs in + the segment and the segment is not only a few SNPs in a region with a high recombination + rate; for each segment that passes this test, we have a segment of shared DNA, and the + total cMs for this segment are computed. + + Finally, discrepant SNPs are checked to ensure that only SNPs internal to a shared DNA + segment are reported as discrepant (i.e., don't report a discrepant SNP if it was part of a + segment that didn't pass the ``snp_threshold``). Currently, no action other than reporting + is taken on discrepant SNPs. + Parameters ---------- individuals : iterable of Individuals @@ -330,6 +360,7 @@ def find_shared_dna( two_chrom_discrepant_snps (pandas.Index) discrepant SNPs discovered while finding shared DNA on two chromosomes """ + # initialize all objects to be returned to be empty to start one_chrom_shared_dna = pd.DataFrame() two_chrom_shared_dna = pd.DataFrame() one_chrom_shared_genes = pd.DataFrame() @@ -337,8 +368,10 @@ def find_shared_dna( one_chrom_discrepant_snps = pd.Index([]) two_chrom_discrepant_snps = pd.Index([]) + # ensure that all individuals have SNPs that are mapped relative to Build 37 self._remap_snps_to_GRCh37(individuals) + # return if there aren't enough individuals to compare if len(individuals) < 2: logger.warning("find_shared_dna requires two or more individuals...") return self._find_shared_dna_return_helper( @@ -350,6 +383,7 @@ def find_shared_dna( two_chrom_discrepant_snps, ) + # load the specified genetic map (one genetic map for each chromosome) genetic_map_dfs = self._resources.get_genetic_map(genetic_map) if len(genetic_map_dfs) == 0: @@ -362,26 +396,34 @@ def find_shared_dna( two_chrom_discrepant_snps, ) - cols = ["genotype{}".format(str(i)) for i in range(len(individuals))] + # generate a list of dynamically named columns for each individual's genotype + # (e.g., genotype0, genotype1, etc). + cols = [f"genotype{str(i)}" for i in range(len(individuals))] + # set the reference SNPs to compare to be that of the first individual df = individuals[0].snps df = df.rename(columns={"genotype": cols[0]}) + # build-up a dataframe of SNPs that are common to all individuals for i, ind in enumerate(individuals[1:]): # join SNPs for all individuals df = df.join(ind.snps["genotype"], how="inner") df = df.rename(columns={"genotype": cols[i + 1]}) + # set a flag for if one individuals is male (i.e., only one chromosome match on the X + # chromosome is possible in the non-PAR region) one_x_chrom = self._is_one_individual_male(individuals) + # create tasks to compute the genetic distances (cMs) between each SNP on each chromosome tasks = [] - chroms_to_drop = [] for chrom in df["chrom"].unique(): if chrom not in genetic_map_dfs.keys(): chroms_to_drop.append(chrom) continue + # each task requires the genetic map for the chromosome and the positions of all SNPs + # in common on that chromosome tasks.append( { "genetic_map": genetic_map_dfs[chrom], @@ -390,19 +432,24 @@ def find_shared_dna( } ) - # drop chromosomes without genetic distance data + # drop chromosomes without genetic distance data (e.g., chroms MT, PAR, etc.) for chrom in chroms_to_drop: df = df.drop(df.loc[df["chrom"] == chrom].index) - # determine the genetic distance between each SNP using the HapMap Phase II genetic map + # determine the genetic distance between each SNP using the specified genetic map snp_distances = map(self._compute_snp_distances, tasks) snp_distances = pd.concat(snp_distances) + + # extract the column "cM_from_prev_snp" from the result and add that to the dataframe + # of SNPs common to all individuals; now we have the genetic distance between each SNP df["cM_from_prev_snp"] = snp_distances["cM_from_prev_snp"] + # now we apply a mask for whether all individuals match on one or two chromosomes... + # first, set all rows for these columns to True df["one_chrom_match"] = True df["two_chrom_match"] = True - - # determine where individuals share an allele on one chromosome + # determine where individuals share an allele on one chromosome (i.e., set to False when + # at least one allele doesn't match for all individuals) for genotype1, genotype2 in combinations(cols, 2): df.loc[ ~df[genotype1].isnull() @@ -414,7 +461,8 @@ def find_shared_dna( "one_chrom_match", ] = False - # determine where individuals share alleles on both chromosomes + # determine where individuals share alleles on two chromosomes (i.e., set to False when + # two alleles don't match for all individuals) for genotype1, genotype2 in combinations(cols, 2): df.loc[ ~df[genotype1].isnull() @@ -430,12 +478,14 @@ def find_shared_dna( # genotype columns are no longer required for calculation df = df.drop(cols, axis=1) + # find shared DNA on one chrom one_chrom_shared_dna, one_chrom_discrepant_snps = self._find_shared_dna_helper( df[["chrom", "pos", "cM_from_prev_snp", "one_chrom_match"]], cM_threshold, snp_threshold, one_x_chrom, ) + # find shared DNA on two chroms two_chrom_shared_dna, two_chrom_discrepant_snps = self._find_shared_dna_helper( df[["chrom", "pos", "cM_from_prev_snp", "two_chrom_match"]], cM_threshold, @@ -646,7 +696,7 @@ def _is_one_individual_male(self, individuals): return False def _compute_snp_distances(self, task): - """ Compute genetic distance between SNPs. + """ Compute genetic distance in cMs between SNPs. Parameters ---------- @@ -732,8 +782,8 @@ def _compute_shared_dna(self, task): "two_chrom_match", ] = False - # get consecutive strings of trues - # http://stackoverflow.com/a/17151327 + # get consecutive strings of Trues, for where there's a one or two chrom match between + # individuals, depending on the task; http://stackoverflow.com/a/17151327 a = df.loc[(df["chrom"] == chrom)][match_col].values a = np.r_[a, False] a_rshifted = np.roll(a, 1) @@ -744,8 +794,10 @@ def _compute_shared_dna(self, task): a_ends = np.nonzero(ends)[0] a_ends = np.reshape(a_ends, (len(a_ends), 1)) + # get the matching segments matches = np.hstack((a_starts, a_ends)) + # compute total cMs for each matching segment c = np.r_[0, df.loc[(df["chrom"] == chrom)]["cM_from_prev_snp"].cumsum()][ matches ]