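"""
pagerank.py -- rank pages in a corpus of HTML files.

PageRank values are estimated two ways: by sampling pages from a
random-walk transition model, and by iterating the PageRank update
until the values converge.

Usage: python pagerank.py corpus
"""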
import os
import random
import re
import sys

DAMPING = 0.85   # damping factor d shared by both algorithms
SAMPLES = 10000  # number of steps taken by the sampling random walk


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")


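# A corpus produced by crawl() might look like this (illustrative filenames):
#   {"1.html": {"2.html", "3.html"}, "2.html": {"3.html"}, "3.html": {"2.html"}}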
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a set of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages


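# Illustrative example for transition_model() below, using the hypothetical
# corpus sketched above crawl() and damping_factor = 0.85:
#   transition_model(corpus, "1.html", 0.85)
#   -> {"1.html": 0.05, "2.html": 0.475, "3.html": 0.475}
# Every page receives (1 - 0.85) / 3 = 0.05, and the two pages linked from
# "1.html" split the remaining 0.85 equally.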
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page a random surfer
    visits next from `page`: with probability `damping_factor` one of the
    page's links is followed, otherwise a page is chosen uniformly.
    """
    # A page with no outgoing links is treated as linking to every page
    if not corpus[page]:
        return {p: 1 / len(corpus) for p in corpus}
    probabilities = {p: (1 - damping_factor) / len(corpus) for p in corpus}
    for link in corpus[page]:
        probabilities[link] += damping_factor / len(corpus[page])
    return probabilities


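# sample_pagerank() estimates ranks as visit frequencies along an n-step
# random walk driven by transition_model(); the estimates sum to 1 and, for
# a large n such as SAMPLES = 10000, should approach the values returned by
# iterate_pagerank().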
def sample_pagerank(corpus, damping_factor, n):
    """
    Estimate PageRank values by sampling n pages, starting from a page
    chosen uniformly at random and then following the transition model.
    """
    counter = {p: 0 for p in corpus}
    pages = list(corpus.keys())
    page = random.choice(pages)
    for _ in range(n):
        counter[page] += 1
        model = transition_model(corpus, page, damping_factor)
        weights = [model[p] for p in pages]
        page = random.choices(pages, weights=weights, k=1)[0]

    return {p: counter[p] / n for p in pages}


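# iterate_pagerank() repeatedly applies the PageRank update
#   PR(p) = (1 - d) / N + d * sum(PR(i) / NumLinks(i))
# summed over every page i that links to p, until no rank changes by more
# than epsilon between iterations.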
def iterate_pagerank(corpus, damping_factor):
    """
    Compute PageRank values by iteratively updating every page's rank
    until no rank changes by more than epsilon between iterations.
    """
    epsilon = 0.001
    n = len(corpus)
    pageranks = {p: 1 / n for p in corpus}
    damping_probability = (1 - damping_factor) / n

    # For each page, collect the set of pages that link to it
    linkedby = {p: set() for p in corpus}
    for i in corpus:
        for j in corpus:
            if j != i and i in corpus[j]:
                linkedby[i].add(j)

    # Pages with no outgoing links are treated as linking to every page
    # (including themselves), matching transition_model()
    dangling = [p for p in corpus if not corpus[p]]

    change = 1
    while change > epsilon:
        pageranks_cp = pageranks.copy()
        change = 0
        for page in corpus:
            pageranks[page] = damping_probability
            for i in linkedby[page]:
                pageranks[page] += damping_factor * pageranks_cp[i] / len(corpus[i])
            for i in dangling:
                pageranks[page] += damping_factor * pageranks_cp[i] / n
            # Track the largest change made during this pass so the loop
            # only stops once every page has converged
            change = max(change, abs(pageranks[page] - pageranks_cp[page]))

    return pageranks


if __name__ == "__main__":
    main()