import nltk import sys import os, string, math FILE_MATCHES = 1 SENTENCE_MATCHES = 1 def main(): # Check command-line arguments if len(sys.argv) != 2: sys.exit("Usage: python questions.py corpus") # Calculate IDF values across files files = load_files(sys.argv[1]) file_words = { filename: tokenize(files[filename]) for filename in files } file_idfs = compute_idfs(file_words) # Prompt user for query query = set(tokenize(input("Query: "))) # Determine top file matches according to TF-IDF filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES) # Extract sentences from top files sentences = dict() for filename in filenames: for passage in files[filename].split("\n"): for sentence in nltk.sent_tokenize(passage): tokens = tokenize(sentence) if tokens: sentences[sentence] = tokens # Compute IDF values across sentences idfs = compute_idfs(sentences) # Determine top sentence matches matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES) for match in matches: print(match) def load_files(directory): """ Given a directory name, return a dictionary mapping the filename of each `.txt` file inside that directory to the file's contents as a string. """ files = os.listdir(directory) mapping = dict() for i in files: with open(os.path.join(directory,i), "r") as f: mapping[i] = f.read() return mapping def tokenize(document): """ Given a document (represented as a string), return a list of all of the words in that document, in order. Process document by coverting all words to lowercase, and removing any punctuation or English stopwords. """ split = nltk.word_tokenize(document.lower()) processed = [] for i in split: if i in nltk.corpus.stopwords.words("english"): continue if i[0] in string.ascii_lowercase: processed.append(i) return processed def compute_idfs(documents): """ Given a dictionary of `documents` that maps names of documents to a list of words, return a dictionary that maps words to their IDF values. Any word that appears in at least one of the documents should be in the resulting dictionary. """ all_words = dict() for i in documents.values(): for j in i: if j in all_words: continue contains = 0 for k in documents: if j in documents[k]: contains += 1 continue all_words[j] = math.log(len(documents)/contains) return all_words def top_files(query, files, idfs, n): """ Given a `query` (a set of words), `files` (a dictionary mapping names of files to a list of their words), and `idfs` (a dictionary mapping words to their IDF values), return a list of the filenames of the the `n` top files that match the query, ranked according to tf-idf. """ rankings = {} for i in files: tfidf = 0 for j in query: tf = files[i].count(j) tfidf += tf * idfs[j] rankings[tfidf] = i scores = list(rankings.keys()) scores.sort(reverse=True) top_n = [] for i in scores[:n]: top_n.append(rankings[i]) return top_n def top_sentences(query, sentences, idfs, n): """ Given a `query` (a set of words), `sentences` (a dictionary mapping sentences to a list of their words), and `idfs` (a dictionary mapping words to their IDF values), return a list of the `n` top sentences that match the query, ranked according to idf. If there are ties, preference should be given to sentences that have a higher query term density. """ rankings = {} for i in sentences: idf = 0 tf_sum = 0 for j in query: tf = sentences[i].count(j) tf_sum += tf idf += tf * idfs[j] if idf not in rankings: rankings[idf] = [] rankings[idf].append((i, tf)) scores = list(rankings.keys()) scores.sort(reverse=True) top_n = [] for i in scores[:n]: selected = rankings[i][0][0] if len(rankings[i]) != 0: max_tf = 0 for j in rankings[i]: if j[1] > max_tf: selected = j[0] top_n.append(selected) return top_n if __name__ == "__main__": main()