My solutions to Harvard's online course CS50AI, An Introduction to Machine Learning
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

104 lines
3.1 KiB

import os
import random
import re
import sys
from numpy.random import choice
import matplotlib
DAMPING = 0.85
SAMPLES = 10000
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python pagerank.py corpus")
corpus = crawl(sys.argv[1])
ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
print(f"PageRank Results from Sampling (n = {SAMPLES})")
for page in sorted(ranks):
print(f" {page}: {ranks[page]:.4f}")
ranks = iterate_pagerank(corpus, DAMPING)
print(f"PageRank Results from Iteration")
for page in sorted(ranks):
print(f" {page}: {ranks[page]:.4f}")
def crawl(directory):
"""
Parse a directory of HTML pages and check for links to other pages.
Return a dictionary where each key is a page, and values are
a list of all other pages in the corpus that are linked to by the page.
"""
pages = dict()
# Extract all links from HTML files
for filename in os.listdir(directory):
if not filename.endswith(".html"):
continue
with open(os.path.join(directory, filename)) as f:
contents = f.read()
links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
pages[filename] = set(links) - {filename}
# Only include links to other pages in the corpus
for filename in pages:
pages[filename] = set(
link for link in pages[filename]
if link in pages
)
return pages
def transition_model(corpus, page, damping_factor):
if corpus[page] == set():
return {x: 1/len(corpus) for x in corpus}
damping_probability = (1-damping_factor)/len(corpus)
probabilities = {x: damping_probability for x in corpus}
for p in corpus[page]:
if p != page:
probabilities[p] += damping_factor/corpus[page].__len__()
return probabilities
def sample_pagerank(corpus, damping_factor, n):
counter = {x: 0 for x in corpus}
pages = list(corpus.keys())
page = random.choice(pages)
for r in range(n):
counter[page] += 1
model = transition_model(corpus, page, damping_factor)
prob = tuple(model[x] for x in pages)
page = pages[choice(len(pages), p=prob)]
pageranks = {p: counter[p]/n for p in pages}
return pageranks
def iterate_pagerank(corpus, damping_factor):
epsilon = 0.001
pageranks = {x: 1/len(corpus) for x in corpus}
linkedby = {}
damping_probability = (1-damping_factor)/len(corpus)
for i in corpus:
linkedby[i] = set()
for j in corpus:
if j != i and i in corpus[j]:
linkedby[i].add(j)
change = 1
while change > epsilon:
pageranks_cp = pageranks.copy()
for page in corpus:
pageranks[page] = damping_probability
for i in linkedby[page]:
pageranks[page] += damping_factor * pageranks_cp[i]/corpus[i].__len__()
if abs(pageranks[page] - pageranks_cp[page]) < change:
change = abs(pageranks[page] - pageranks_cp[page])
return pageranks
if __name__ == "__main__":
main()