My solutions to Harvard's online course CS50AI, an introduction to artificial intelligence with Python.

import nltk
import sys
import os, string, math
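
# Note: the tokenizer and stopword list below rely on NLTK data packages that
# are not installed with nltk itself. If they are missing, a one-time
# nltk.download("punkt") and nltk.download("stopwords") is typically required
# before running this script.
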
FILE_MATCHES = 1
SENTENCE_MATCHES = 1

def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("Query: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

    # Extract sentences from top files
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens

    # Compute IDF values across sentences
    idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
    for match in matches:
        print(match)

def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    mapping = dict()
    for filename in os.listdir(directory):
        # Only `.txt` files belong in the mapping, per the docstring
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), "r", encoding="utf8") as f:
            mapping[filename] = f.read()
    return mapping
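
# Illustrative (hypothetical) shape of the returned mapping:
#   {"python.txt": "Python is an interpreted, high-level language ...",
#    "ai.txt": "..."}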

def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process document by converting all words to lowercase, and removing any
    punctuation or English stopwords.
    """
    # Build the stopword set once rather than on every loop iteration
    stopwords = set(nltk.corpus.stopwords.words("english"))
    processed = []
    for word in nltk.word_tokenize(document.lower()):
        # Drop English stopwords
        if word in stopwords:
            continue
        # Drop tokens that consist entirely of punctuation
        if all(char in string.punctuation for char in word):
            continue
        processed.append(word)
    return processed
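
# Rough example of what tokenize() produces, assuming NLTK's default English
# tokenizer and stopword list:
#   tokenize("Artificial Intelligence, or AI, is the study of agents.")
#   -> ["artificial", "intelligence", "ai", "study", "agents"]
# Stopwords ("or", "is", "the", "of") and punctuation tokens are dropped.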

def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    all_words = dict()
    for words in documents.values():
        for word in words:
            # Compute each word's IDF only once
            if word in all_words:
                continue
            # Number of documents that contain this word
            contains = sum(1 for doc in documents.values() if word in doc)
            all_words[word] = math.log(len(documents) / contains)
    return all_words
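
# Worked example of the IDF formula above, using a hypothetical corpus of
# 4 documents: a word found in only 1 of them gets idf = ln(4 / 1) ≈ 1.386,
# while a word found in all 4 gets idf = ln(4 / 4) = 0, so words common to
# every document contribute nothing to the rankings.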

def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """
    rankings = []
    for filename, words in files.items():
        tfidf = 0
        for word in query:
            tf = words.count(word)
            # Query words missing from every file contribute nothing
            tfidf += tf * idfs.get(word, 0)
        rankings.append((tfidf, filename))

    # Sort by tf-idf score, highest first, and keep the top `n` filenames
    rankings.sort(key=lambda pair: pair[0], reverse=True)
    return [filename for _, filename in rankings[:n]]
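
# Sketch of the tf-idf sum used above, with hypothetical numbers: for the
# query {"machine", "learning"}, a file containing "machine" 3 times
# (idf 0.7) and "learning" twice (idf 1.1) scores 3 * 0.7 + 2 * 1.1 = 4.3,
# and filenames are returned in descending order of this score.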

def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    rankings = []
    for sentence, words in sentences.items():
        idf = 0
        tf_sum = 0
        for word in query:
            tf = words.count(word)
            tf_sum += tf
            # Query words missing from every sentence contribute nothing
            idf += tf * idfs.get(word, 0)
        # Query term density: proportion of the sentence's words that
        # appear in the query
        density = tf_sum / len(words)
        rankings.append((idf, density, sentence))

    # Sort by idf score first, breaking ties with query term density
    rankings.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [sentence for _, _, sentence in rankings[:n]]
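
# Tie-breaking sketch with hypothetical values: two sentences with the same
# idf score of 2.0, one 10 words long with 3 query-word occurrences
# (density 0.3) and one 20 words long with 4 occurrences (density 0.2),
# are ordered with the shorter, denser sentence first.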

if __name__ == "__main__":
    main()
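
# Example invocation, assuming a `corpus` directory of .txt files next to
# this script (the program then prompts for a query on stdin):
#
#   $ python questions.py corpus
#   Query: ...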