My solutions to Harvard's online course CS50AI, an introduction to artificial intelligence with Python.

import nltk
import sys
import os, string, math
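
# Note: the tokenizer and stopword list below rely on NLTK data packages that
# are not installed with nltk itself. If they are missing, a one-time
# nltk.download("punkt") and nltk.download("stopwords") is typically required
# before running this script.
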
FILE_MATCHES = 1
SENTENCE_MATCHES = 1

def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("Query: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

    # Extract sentences from top files
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens

    # Compute IDF values across sentences
    idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
    for match in matches:
        print(match)

def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    mapping = dict()
    for filename in os.listdir(directory):
        # Only `.txt` files belong in the mapping, per the docstring
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), "r", encoding="utf8") as f:
            mapping[filename] = f.read()
    return mapping
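
# Illustrative (hypothetical) shape of the returned mapping:
#   {"python.txt": "Python is an interpreted, high-level language ...",
#    "ai.txt": "..."}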

def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process document by converting all words to lowercase, and removing any
    punctuation or English stopwords.
    """
    # Build the stopword set once rather than on every loop iteration
    stopwords = set(nltk.corpus.stopwords.words("english"))
    processed = []
    for word in nltk.word_tokenize(document.lower()):
        # Drop English stopwords
        if word in stopwords:
            continue
        # Drop tokens that consist entirely of punctuation
        if all(char in string.punctuation for char in word):
            continue
        processed.append(word)
    return processed
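
# Rough example of what tokenize() produces, assuming NLTK's default English
# tokenizer and stopword list:
#   tokenize("Artificial Intelligence, or AI, is the study of agents.")
#   -> ["artificial", "intelligence", "ai", "study", "agents"]
# Stopwords ("or", "is", "the", "of") and punctuation tokens are dropped.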

def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    all_words = dict()
    for words in documents.values():
        for word in words:
            # Compute each word's IDF only once
            if word in all_words:
                continue
            # Number of documents that contain this word
            contains = sum(1 for doc in documents.values() if word in doc)
            all_words[word] = math.log(len(documents) / contains)
    return all_words
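
# Worked example of the IDF formula above, using a hypothetical corpus of
# 4 documents: a word found in only 1 of them gets idf = ln(4 / 1) ≈ 1.386,
# while a word found in all 4 gets idf = ln(4 / 4) = 0, so words common to
# every document contribute nothing to the rankings.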

def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """
    rankings = []
    for filename, words in files.items():
        tfidf = 0
        for word in query:
            tf = words.count(word)
            # Query words missing from every file contribute nothing
            tfidf += tf * idfs.get(word, 0)
        rankings.append((tfidf, filename))

    # Sort by tf-idf score, highest first, and keep the top `n` filenames
    rankings.sort(key=lambda pair: pair[0], reverse=True)
    return [filename for _, filename in rankings[:n]]
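
# Sketch of the tf-idf sum used above, with hypothetical numbers: for the
# query {"machine", "learning"}, a file containing "machine" 3 times
# (idf 0.7) and "learning" twice (idf 1.1) scores 3 * 0.7 + 2 * 1.1 = 4.3,
# and filenames are returned in descending order of this score.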

def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    rankings = []
    for sentence, words in sentences.items():
        idf = 0
        tf_sum = 0
        for word in query:
            tf = words.count(word)
            tf_sum += tf
            # Query words missing from every sentence contribute nothing
            idf += tf * idfs.get(word, 0)
        # Query term density: proportion of the sentence's words that
        # appear in the query
        density = tf_sum / len(words)
        rankings.append((idf, density, sentence))

    # Sort by idf score first, breaking ties with query term density
    rankings.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [sentence for _, _, sentence in rankings[:n]]
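
# Tie-breaking sketch with hypothetical values: two sentences with the same
# idf score of 2.0, one 10 words long with 3 query-word occurrences
# (density 0.3) and one 20 words long with 4 occurrences (density 0.2),
# are ordered with the shorter, denser sentence first.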

if __name__ == "__main__":
    main()
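
# Example invocation, assuming a `corpus` directory of .txt files next to
# this script (the program then prompts for a query on stdin):
#
#   $ python questions.py corpus
#   Query: ...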