|
|
- import csv
- import itertools
- import sys
-
- PROBS = {
-
- # Unconditional probabilities for having gene
- "gene": {
- 2: 0.01,
- 1: 0.03,
- 0: 0.96
- },
-
- "trait": {
-
- # Probability of trait given two copies of gene
- 2: {
- True: 0.65,
- False: 0.35
- },
-
- # Probability of trait given one copy of gene
- 1: {
- True: 0.56,
- False: 0.44
- },
-
- # Probability of trait given no gene
- 0: {
- True: 0.01,
- False: 0.99
- }
- },
-
- # Mutation probability
- "mutation": 0.01
- }
-
-
- def main():
-
- # Check for proper usage
- if len(sys.argv) != 2:
- sys.exit("Usage: python heredity.py data.csv")
- people = load_data(sys.argv[1])
-
- # Keep track of gene and trait probabilities for each person
- probabilities = {
- person: {
- "gene": {
- 2: 0,
- 1: 0,
- 0: 0
- },
- "trait": {
- True: 0,
- False: 0
- }
- }
- for person in people
- }
-
- # Loop over all sets of people who might have the trait
- names = set(people)
- for have_trait in powerset(names):
-
- # Check if current set of people violates known information
- fails_evidence = any(
- (people[person]["trait"] is not None and
- people[person]["trait"] != (person in have_trait))
- for person in names
- )
- if fails_evidence:
- continue
-
- # Loop over all sets of people who might have the gene
- for one_gene in powerset(names):
- for two_genes in powerset(names - one_gene):
-
- # Update probabilities with new joint probability
- p = joint_probability(people, one_gene, two_genes, have_trait)
- update(probabilities, one_gene, two_genes, have_trait, p)
-
- # Ensure probabilities sum to 1
- normalize(probabilities)
-
- # Print results
- for person in people:
- print(f"{person}:")
- for field in probabilities[person]:
- print(f" {field.capitalize()}:")
- for value in probabilities[person][field]:
- p = probabilities[person][field][value]
- print(f" {value}: {p:.4f}")
-
-
- def load_data(filename):
- """
- Load gene and trait data from a file into a dictionary.
- File assumed to be a CSV containing fields name, mother, father, trait.
- mother, father must both be blank, or both be valid names in the CSV.
- trait should be 0 or 1 if trait is known, blank otherwise.
- """
- data = dict()
- with open(filename) as f:
- reader = csv.DictReader(f)
- for row in reader:
- name = row["name"]
- data[name] = {
- "name": name,
- "mother": row["mother"] or None,
- "father": row["father"] or None,
- "trait": (True if row["trait"] == "1" else
- False if row["trait"] == "0" else None)
- }
- return data
-
-
- def powerset(s):
- """
- Return a list of all possible subsets of set s.
- """
- s = list(s)
- return [
- set(s) for s in itertools.chain.from_iterable(
- itertools.combinations(s, r) for r in range(len(s) + 1)
- )
- ]
-
- def get_info(person, one_gene, two_genes, have_trait):
- trait = person in have_trait
- gene = 0
- if person in one_gene:
- gene = 1
- elif person in two_genes:
- gene = 2
- return gene, trait
-
-
- def joint_probability(people, one_gene, two_genes, have_trait):
- """
- Compute and return a joint probability.
-
- The probability returned should be the probability that
- * everyone in set `one_gene` has one copy of the gene, and
- * everyone in set `two_genes` has two copies of the gene, and
- * everyone not in `one_gene` or `two_gene` does not have the gene, and
- * everyone in set `have_trait` has the trait, and
- * everyone not in set` have_trait` does not have the trait.
- """
-
- def generate_prob(m_gene, f_gene, gene_combination):
- if m_gene == 1:
- m_prob = 0.5
- else:
- m_prob = 0.99 if m_gene/2 == gene_combination[0] else 0.01
-
- if f_gene == 1:
- f_prob = 0.5
- else:
- f_prob = 0.99 if f_gene/2 == gene_combination[1] else 0.01
- return m_prob * f_prob
-
- probabilities = []
-
- for person in people:
- gene, trait = get_info(person, one_gene, two_genes, have_trait)
- if people[person]["mother"] and people[person]["father"]:
- mother_gene, foo = get_info(people[person]["mother"], one_gene, two_genes, have_trait)
- father_gene, foo = get_info(people[person]["father"], one_gene, two_genes, have_trait)
- if gene == 1:
- gene_prob = generate_prob(mother_gene, father_gene, (0, 1)) + generate_prob(mother_gene, father_gene, (1, 0))
- else:
- gene_prob = generate_prob(mother_gene, father_gene, (gene/2, gene/2))
- else:
- gene_prob = PROBS["gene"][gene]
- probabilities.append(gene_prob * PROBS["trait"][gene][trait])
-
- joint_prob = 1
- for p in probabilities:
- joint_prob *= p
-
- return joint_prob
-
-
- def update(probabilities, one_gene, two_genes, have_trait, p):
- for person in probabilities:
- gene, trait = get_info(person, one_gene, two_genes, have_trait)
- probabilities[person]["gene"][gene] += p
- probabilities[person]["trait"][trait] += p
-
-
- def normalize(probabilities):
- for person in probabilities:
- psum = 0
- for gene in probabilities[person]["gene"]:
- psum += probabilities[person]["gene"][gene]
- gene_ratio = 1/psum
- for gene in probabilities[person]["gene"]:
- probabilities[person]["gene"][gene] *= gene_ratio
-
- psum = 0
- for trait in probabilities[person]["trait"]:
- psum += probabilities[person]["trait"][trait]
- trait_ratio = 1/psum
- for trait in probabilities[person]["trait"]:
- probabilities[person]["trait"][trait] *= trait_ratio
-
-
- if __name__ == "__main__":
- main()
-
|