import csv
|
|
import itertools
|
|
import sys
|
|
|
|
PROBS = {
|
|
|
|
# Unconditional probabilities for having gene
|
|
"gene": {
|
|
2: 0.01,
|
|
1: 0.03,
|
|
0: 0.96
|
|
},
|
|
|
|
"trait": {
|
|
|
|
# Probability of trait given two copies of gene
|
|
2: {
|
|
True: 0.65,
|
|
False: 0.35
|
|
},
|
|
|
|
# Probability of trait given one copy of gene
|
|
1: {
|
|
True: 0.56,
|
|
False: 0.44
|
|
},
|
|
|
|
# Probability of trait given no gene
|
|
0: {
|
|
True: 0.01,
|
|
False: 0.99
|
|
}
|
|
},
|
|
|
|
# Mutation probability
|
|
"mutation": 0.01
|
|
}
|
|
|
|
|
|
def main():
|
|
|
|
# Check for proper usage
|
|
if len(sys.argv) != 2:
|
|
sys.exit("Usage: python heredity.py data.csv")
|
|
people = load_data(sys.argv[1])
|
|
|
|
# Keep track of gene and trait probabilities for each person
|
|
probabilities = {
|
|
person: {
|
|
"gene": {
|
|
2: 0,
|
|
1: 0,
|
|
0: 0
|
|
},
|
|
"trait": {
|
|
True: 0,
|
|
False: 0
|
|
}
|
|
}
|
|
for person in people
|
|
}
|
|
|
|
# Loop over all sets of people who might have the trait
|
|
names = set(people)
|
|
for have_trait in powerset(names):
|
|
|
|
# Check if current set of people violates known information
|
|
fails_evidence = any(
|
|
(people[person]["trait"] is not None and
|
|
people[person]["trait"] != (person in have_trait))
|
|
for person in names
|
|
)
|
|
if fails_evidence:
|
|
continue
|
|
|
|
# Loop over all sets of people who might have the gene
|
|
for one_gene in powerset(names):
|
|
for two_genes in powerset(names - one_gene):
|
|
|
|
# Update probabilities with new joint probability
|
|
p = joint_probability(people, one_gene, two_genes, have_trait)
|
|
update(probabilities, one_gene, two_genes, have_trait, p)
|
|
|
|
# Ensure probabilities sum to 1
|
|
normalize(probabilities)
|
|
|
|
# Print results
|
|
for person in people:
|
|
print(f"{person}:")
|
|
for field in probabilities[person]:
|
|
print(f" {field.capitalize()}:")
|
|
for value in probabilities[person][field]:
|
|
p = probabilities[person][field][value]
|
|
print(f" {value}: {p:.4f}")
|
|
|
|
|
|
def load_data(filename):
|
|
"""
|
|
Load gene and trait data from a file into a dictionary.
|
|
File assumed to be a CSV containing fields name, mother, father, trait.
|
|
mother, father must both be blank, or both be valid names in the CSV.
|
|
trait should be 0 or 1 if trait is known, blank otherwise.
|
|
"""
|
|
data = dict()
|
|
with open(filename) as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
name = row["name"]
|
|
data[name] = {
|
|
"name": name,
|
|
"mother": row["mother"] or None,
|
|
"father": row["father"] or None,
|
|
"trait": (True if row["trait"] == "1" else
|
|
False if row["trait"] == "0" else None)
|
|
}
|
|
return data
|
|
|
|
|
|
def powerset(s):
|
|
"""
|
|
Return a list of all possible subsets of set s.
|
|
"""
|
|
s = list(s)
|
|
return [
|
|
set(s) for s in itertools.chain.from_iterable(
|
|
itertools.combinations(s, r) for r in range(len(s) + 1)
|
|
)
|
|
]
|
|
|
|
def get_info(person, one_gene, two_genes, have_trait):
|
|
trait = person in have_trait
|
|
gene = 0
|
|
if person in one_gene:
|
|
gene = 1
|
|
elif person in two_genes:
|
|
gene = 2
|
|
return gene, trait
|
|
|
|
|
|
def joint_probability(people, one_gene, two_genes, have_trait):
|
|
"""
|
|
Compute and return a joint probability.
|
|
|
|
The probability returned should be the probability that
|
|
* everyone in set `one_gene` has one copy of the gene, and
|
|
* everyone in set `two_genes` has two copies of the gene, and
|
|
* everyone not in `one_gene` or `two_gene` does not have the gene, and
|
|
* everyone in set `have_trait` has the trait, and
|
|
* everyone not in set` have_trait` does not have the trait.
|
|
"""
|
|
|
|
def generate_prob(m_gene, f_gene, gene_combination):
|
|
if m_gene == 1:
|
|
m_prob = 0.5
|
|
else:
|
|
m_prob = 0.99 if m_gene/2 == gene_combination[0] else 0.01
|
|
|
|
if f_gene == 1:
|
|
f_prob = 0.5
|
|
else:
|
|
f_prob = 0.99 if f_gene/2 == gene_combination[1] else 0.01
|
|
return m_prob * f_prob
|
|
|
|
probabilities = []
|
|
|
|
for person in people:
|
|
gene, trait = get_info(person, one_gene, two_genes, have_trait)
|
|
if people[person]["mother"] and people[person]["father"]:
|
|
mother_gene, foo = get_info(people[person]["mother"], one_gene, two_genes, have_trait)
|
|
father_gene, foo = get_info(people[person]["father"], one_gene, two_genes, have_trait)
|
|
if gene == 1:
|
|
gene_prob = generate_prob(mother_gene, father_gene, (0, 1)) + generate_prob(mother_gene, father_gene, (1, 0))
|
|
else:
|
|
gene_prob = generate_prob(mother_gene, father_gene, (gene/2, gene/2))
|
|
else:
|
|
gene_prob = PROBS["gene"][gene]
|
|
probabilities.append(gene_prob * PROBS["trait"][gene][trait])
|
|
|
|
joint_prob = 1
|
|
for p in probabilities:
|
|
joint_prob *= p
|
|
|
|
return joint_prob
|
|
|
|
|
|
def update(probabilities, one_gene, two_genes, have_trait, p):
|
|
for person in probabilities:
|
|
gene, trait = get_info(person, one_gene, two_genes, have_trait)
|
|
probabilities[person]["gene"][gene] += p
|
|
probabilities[person]["trait"][trait] += p
|
|
|
|
|
|
def normalize(probabilities):
|
|
for person in probabilities:
|
|
psum = 0
|
|
for gene in probabilities[person]["gene"]:
|
|
psum += probabilities[person]["gene"][gene]
|
|
gene_ratio = 1/psum
|
|
for gene in probabilities[person]["gene"]:
|
|
probabilities[person]["gene"][gene] *= gene_ratio
|
|
|
|
psum = 0
|
|
for trait in probabilities[person]["trait"]:
|
|
psum += probabilities[person]["trait"][trait]
|
|
trait_ratio = 1/psum
|
|
for trait in probabilities[person]["trait"]:
|
|
probabilities[person]["trait"][trait] *= trait_ratio
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|