# heredity.py

import csv
import itertools
import sys

# Fixed model parameters used when computing joint probabilities.
PROBS = {
    # Prior distribution over the number of gene copies, applied to people
    # whose parents are not listed in the data.
    "gene": {
        2: 0.01,
        1: 0.03,
        0: 0.96,
    },
    # Conditional distribution of the trait, keyed first by the number of
    # gene copies (2, 1 or 0) and then by whether the trait is present.
    "trait": {
        2: {True: 0.65, False: 0.35},
        1: {True: 0.56, False: 0.44},
        0: {True: 0.01, False: 0.99},
    },
    # Mutation probability applied when a parent hands a gene to a child.
    "mutation": 0.01,
}


def main():
    """
    Command-line entry point.

    Loads the family data named on the command line, accumulates the joint
    probability of every gene/trait assignment that agrees with the known
    trait values, normalizes the totals, and prints each person's
    distributions.
    """

    # Exactly one argument (the CSV path) is expected
    if len(sys.argv) != 2:
        sys.exit("Usage: python heredity.py data.csv")
    people = load_data(sys.argv[1])

    # Running totals for every person, all starting at zero
    probabilities = {}
    for person in people:
        probabilities[person] = {
            "gene": {2: 0, 1: 0, 0: 0},
            "trait": {True: 0, False: 0},
        }

    names = set(people)

    # Consider every possible set of people who have the trait
    for have_trait in powerset(names):

        # Discard assignments that contradict a recorded trait value
        matches_evidence = all(
            people[name]["trait"] is None
            or people[name]["trait"] == (name in have_trait)
            for name in names
        )
        if not matches_evidence:
            continue

        # Consider every split of people into one-copy and two-copy carriers
        for one_gene in powerset(names):
            for two_genes in powerset(names - one_gene):
                p = joint_probability(people, one_gene, two_genes, have_trait)
                update(probabilities, one_gene, two_genes, have_trait, p)

    # Scale every distribution so that it sums to 1
    normalize(probabilities)

    # Report the results
    for person in people:
        print(f"{person}:")
        for field, distribution in probabilities[person].items():
            print(f"  {field.capitalize()}:")
            for value, p in distribution.items():
                print(f"    {value}: {p:.4f}")


def load_data(filename):
    """
    Load gene and trait data from a CSV file into a dictionary.

    The file is assumed to have a header row with the fields
    name, mother, father, trait.
    mother, father must both be blank, or both be valid names in the CSV.
    trait should be 0 or 1 if trait is known, blank otherwise.

    Returns a dict mapping each name to a record with the keys
    "name", "mother", "father" and "trait". Blank parents become None, and
    the trait is True for "1", False for "0" and None for anything else.
    If a name appears on several rows, the last row wins.

    Raises KeyError if a required column is missing from the file.
    """
    trait_values = {"1": True, "0": False}

    data = {}
    # newline="" hands line endings to the csv module untouched, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are silently rewritten by the text layer.
    with open(filename, newline="") as f:
        for row in csv.DictReader(f):
            name = row["name"]
            data[name] = {
                "name": name,
                "mother": row["mother"] or None,
                "father": row["father"] or None,
                # Anything other than "1" or "0" means the trait is unknown
                "trait": trait_values.get(row["trait"]),
            }
    return data


def powerset(s):
    """
    Build the list of every subset of `s`, each as a set.

    Subsets are produced smallest first, starting with the empty set and
    ending with the full collection.
    """
    elements = list(s)
    subsets = []
    for size in range(len(elements) + 1):
        for combo in itertools.combinations(elements, size):
            subsets.append(set(combo))
    return subsets


def joint_probability(people, one_gene, two_genes, have_trait):
    """
    Compute and return a joint probability.

    The probability returned is the probability that
        * everyone in set `one_gene` has one copy of the gene, and
        * everyone in set `two_genes` has two copies of the gene, and
        * everyone not in `one_gene` or `two_genes` does not have the gene, and
        * everyone in set `have_trait` has the trait, and
        * everyone not in set `have_trait` does not have the trait.

    `people` maps each name to a record with the keys "name", "mother",
    "father" and "trait"; the parents are names or None. If a name appears
    in both `one_gene` and `two_genes`, it is treated as having two copies.

    Returns the product, over every person, of their trait probability and
    their gene-count probability.
    """

    mutation = PROBS["mutation"]

    # Probability that a parent hands the gene to a child, keyed by the
    # parent's own copy count. With one copy the result is exactly 0.5:
    # 0.5 * (1 - mutation) + 0.5 * mutation.
    pass_on = {
        2: 1.0 - mutation,
        1: 0.5,
        0: mutation,
    }

    def copies_of(name):
        """Return the number of gene copies this scenario assigns to `name`."""
        if name in two_genes:
            return 2
        if name in one_gene:
            return 1
        return 0

    probability = 1.0

    for person in people.values():
        name = person["name"]
        gene_copies = copies_of(name)

        # Factor in trait probability
        probability *= PROBS["trait"][gene_copies][name in have_trait]

        mother = person["mother"]
        father = person["father"]

        # No parents listed: fall back to the unconditional distribution
        if mother is None and father is None:
            probability *= PROBS["gene"][gene_copies]
            continue

        from_mother = pass_on[copies_of(mother)]
        from_father = pass_on[copies_of(father)]
        not_from_mother = 1.0 - from_mother
        not_from_father = 1.0 - from_father

        if gene_copies == 2:
            # Gene received from mother AND father
            probability *= from_mother * from_father
        elif gene_copies == 1:
            # Gene received from exactly one of the two parents
            probability *= (
                not_from_mother * from_father + from_mother * not_from_father
            )
        else:
            # Gene received from neither parent
            probability *= not_from_mother * not_from_father

    return probability


def update(probabilities, one_gene, two_genes, have_trait, p):
    """
    Add the joint probability `p` to the running totals in `probabilities`.

    For every person, `p` is added to one entry of their "gene"
    distribution (2 if they are in `two_genes`, otherwise 1 if they are in
    `one_gene`, otherwise 0) and to one entry of their "trait" distribution
    (True if they are in `have_trait`, otherwise False).
    The dictionary is modified in place.
    """

    for name, distributions in probabilities.items():
        if name in two_genes:
            copies = 2
        elif name in one_gene:
            copies = 1
        else:
            copies = 0

        has_trait = name in have_trait
        distributions["gene"][copies] += p
        distributions["trait"][has_trait] += p

def normalize(probabilities):
    """
    Update `probabilities` such that each probability distribution
    is normalized (i.e., sums to 1, with relative proportions the same).

    Each person's "gene" and "trait" distributions are rescaled in place.
    A distribution whose values sum to 0 cannot be rescaled and is left
    unchanged rather than raising ZeroDivisionError.
    """

    for distributions in probabilities.values():
        for field in ("gene", "trait"):
            distribution = distributions[field]
            total = sum(distribution.values())

            # Nothing was accumulated, so there is no proportion to preserve
            if total == 0:
                continue

            # Only values are reassigned, so iterating the keys is safe
            for outcome in distribution:
                distribution[outcome] /= total


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()