My solutions to Harvard's online course CS50AI, An Introduction to Machine Learning
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

104 lines
3.1 KiB

4 years ago
  1. import os
  2. import random
  3. import re
  4. import sys
  5. from numpy.random import choice
  6. import matplotlib
  7. DAMPING = 0.85
  8. SAMPLES = 10000
  9. def main():
  10. if len(sys.argv) != 2:
  11. sys.exit("Usage: python pagerank.py corpus")
  12. corpus = crawl(sys.argv[1])
  13. ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
  14. print(f"PageRank Results from Sampling (n = {SAMPLES})")
  15. for page in sorted(ranks):
  16. print(f" {page}: {ranks[page]:.4f}")
  17. ranks = iterate_pagerank(corpus, DAMPING)
  18. print(f"PageRank Results from Iteration")
  19. for page in sorted(ranks):
  20. print(f" {page}: {ranks[page]:.4f}")
  21. def crawl(directory):
  22. """
  23. Parse a directory of HTML pages and check for links to other pages.
  24. Return a dictionary where each key is a page, and values are
  25. a list of all other pages in the corpus that are linked to by the page.
  26. """
  27. pages = dict()
  28. # Extract all links from HTML files
  29. for filename in os.listdir(directory):
  30. if not filename.endswith(".html"):
  31. continue
  32. with open(os.path.join(directory, filename)) as f:
  33. contents = f.read()
  34. links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
  35. pages[filename] = set(links) - {filename}
  36. # Only include links to other pages in the corpus
  37. for filename in pages:
  38. pages[filename] = set(
  39. link for link in pages[filename]
  40. if link in pages
  41. )
  42. return pages
  43. def transition_model(corpus, page, damping_factor):
  44. if corpus[page] == set():
  45. return {x: 1/len(corpus) for x in corpus}
  46. damping_probability = (1-damping_factor)/len(corpus)
  47. probabilities = {x: damping_probability for x in corpus}
  48. for p in corpus[page]:
  49. if p != page:
  50. probabilities[p] += damping_factor/corpus[page].__len__()
  51. return probabilities
  52. def sample_pagerank(corpus, damping_factor, n):
  53. counter = {x: 0 for x in corpus}
  54. pages = list(corpus.keys())
  55. page = random.choice(pages)
  56. for r in range(n):
  57. counter[page] += 1
  58. model = transition_model(corpus, page, damping_factor)
  59. prob = tuple(model[x] for x in pages)
  60. page = pages[choice(len(pages), p=prob)]
  61. pageranks = {p: counter[p]/n for p in pages}
  62. return pageranks
  63. def iterate_pagerank(corpus, damping_factor):
  64. epsilon = 0.001
  65. pageranks = {x: 1/len(corpus) for x in corpus}
  66. linkedby = {}
  67. damping_probability = (1-damping_factor)/len(corpus)
  68. for i in corpus:
  69. linkedby[i] = set()
  70. for j in corpus:
  71. if j != i and i in corpus[j]:
  72. linkedby[i].add(j)
  73. change = 1
  74. while change > epsilon:
  75. pageranks_cp = pageranks.copy()
  76. for page in corpus:
  77. pageranks[page] = damping_probability
  78. for i in linkedby[page]:
  79. pageranks[page] += damping_factor * pageranks_cp[i]/corpus[i].__len__()
  80. if abs(pageranks[page] - pageranks_cp[page]) < change:
  81. change = abs(pageranks[page] - pageranks_cp[page])
  82. return pageranks
  83. if __name__ == "__main__":
  84. main()