yigitcolakoglu
/
CS50AI_assignments


								import csv

								import sys


								from sklearn.model_selection import train_test_split

								from sklearn.neighbors import KNeighborsClassifier


								def passv(x):
									"""Return the argument unchanged."""
									return x


								# Month names in calendar order; a name's position in this list is its
								# numeric encoding (list.index raises ValueError for an unknown name).
								MONTHS = [
									'Jan', 'Feb', 'Mar', 'April', 'May', 'June',
									'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
								]

								# Fraction of the data set held out for testing.
								TEST_SIZE = 0.4

								# One converter per evidence column, in column order. Each converter
								# takes the raw CSV string for that column and returns a number.
								CONVERSION_FUNCTIONS = [
									int,            # Administrative
									float,          # Administrative_Duration
									int,            # Informational
									float,          # Informational_Duration
									int,            # ProductRelated
									float,          # ProductRelated_Duration
									float,          # BounceRates
									float,          # ExitRates
									float,          # PageValues
									float,          # SpecialDay
									MONTHS.index,   # Month
									int,            # OperatingSystems
									int,            # Browser
									int,            # Region
									int,            # TrafficType
									# VisitorType: 1 only for the exact string "Returning_Visitor".
									lambda x: int(x == "Returning_Visitor"),
									# Weekend: 0 only for the exact string "FALSE", 1 for anything else.
									lambda x: int(x != "FALSE"),
								]


								def main():
									"""
									Command-line entry point.

									Expects exactly one argument (the data file), loads it, splits it
									into training and testing portions, fits a model on the training
									portion and prints how well it predicts the testing portion.
									Exits with a usage message when the argument count is wrong.
									"""
									arguments = sys.argv
									if len(arguments) != 2:
										sys.exit("Usage: python shopping.py data")

									# Read the data set and hold out TEST_SIZE of it for testing.
									evidence, labels = load_data(arguments[1])
									split = train_test_split(evidence, labels, test_size=TEST_SIZE)
									train_evidence, test_evidence, train_labels, test_labels = split

									# Fit on the training portion, then predict the held-out portion.
									classifier = train_model(train_evidence, train_labels)
									predicted = classifier.predict(test_evidence)
									true_positive_rate, true_negative_rate = evaluate(test_labels, predicted)

									# Element-wise comparison of actual and predicted labels.
									correct = (test_labels == predicted).sum()
									incorrect = (test_labels != predicted).sum()

									# Report the outcome.
									print(f"Correct: {correct}")
									print(f"Incorrect: {incorrect}")
									print(f"True Positive Rate: {100 * true_positive_rate:.2f}%")
									print(f"True Negative Rate: {100 * true_negative_rate:.2f}%")


								def load_data(filename):
									"""
									Load shopping data from the CSV file `filename` and convert it into a
									list of evidence lists and a list of labels.

									The first line of the file is treated as a header and skipped, and
									completely blank lines are ignored. In every remaining row the last
									column is the label and all preceding columns are evidence.

									Each evidence list contains the following values, in order:
										- Administrative, an integer
										- Administrative_Duration, a floating point number
										- Informational, an integer
										- Informational_Duration, a floating point number
										- ProductRelated, an integer
										- ProductRelated_Duration, a floating point number
										- BounceRates, a floating point number
										- ExitRates, a floating point number
										- PageValues, a floating point number
										- SpecialDay, a floating point number
										- Month, an index from 0 (January) to 11 (December)
										- OperatingSystems, an integer
										- Browser, an integer
										- Region, an integer
										- TrafficType, an integer
										- VisitorType, an integer 0 (not returning) or 1 (returning)
										- Weekend, an integer 0 (if false) or 1 (if true)

									Each label is 0 when the Revenue column is the string "FALSE" and 1
									otherwise.

									Args:
										filename: path of the CSV file to read.

									Returns:
										A tuple (evidence, labels) of two lists of equal length.

									Raises:
										OSError: if the file cannot be opened.
										ValueError: if a field cannot be converted to its numeric form.
										IndexError: if a row has more evidence columns than there are
											entries in CONVERSION_FUNCTIONS.
									"""
									evidence = []
									labels = []
									# Read the file the caller asked for (not a hard-coded path).
									# newline="" lets the csv module do its own line-ending handling.
									with open(filename, newline="") as f:
										reader = csv.reader(f)
										# Skip the header row; the default avoids StopIteration on an
										# empty file.
										next(reader, None)
										for row in reader:
											if not row:
												# csv.reader yields [] for a blank line; nothing to parse.
												continue
											*fields, revenue = row
											evidence.append(
												[CONVERSION_FUNCTIONS[i](value) for i, value in enumerate(fields)]
											)
											labels.append(1 if revenue != "FALSE" else 0)
									return evidence, labels


								def train_model(evidence, labels):
									"""
									Given a list of evidence lists and a list of labels, return a
									fitted k-nearest neighbor model (k=1) trained on the data.

									Args:
										evidence: list of numeric evidence lists, one per sample.
										labels: list of labels, one per evidence list.

									Returns:
										The KNeighborsClassifier after it has been fitted to the data.
									"""
									# k must be 1 to honour the documented contract of this function.
									model = KNeighborsClassifier(n_neighbors=1)
									model.fit(evidence, labels)
									return model


								def evaluate(labels, predictions):
									"""
									Given a list of actual labels and a list of predicted labels,
									return a tuple (sensitivity, specificity).

									A label of 0 is treated as negative; any other label is treated as
									positive. The two sequences are compared pairwise.

									`sensitivity` is a floating-point value from 0 to 1 representing the
									"true positive rate": the proportion of actual positive labels that
									were accurately identified.

									`specificity` is a floating-point value from 0 to 1 representing the
									"true negative rate": the proportion of actual negative labels that
									were accurately identified.

									When there are no actual positives (or no actual negatives), the
									corresponding rate is reported as 0.0 instead of raising
									ZeroDivisionError.
									"""
									true_positives = 0
									true_negatives = 0
									positives = 0
									negatives = 0
									for actual, predicted in zip(labels, predictions):
										if actual == 0:
											negatives += 1
											if actual == predicted:
												true_negatives += 1
										else:
											positives += 1
											if actual == predicted:
												true_positives += 1

									# Guard the divisions: a class that never occurs has no defined rate.
									sensitivity = true_positives / positives if positives else 0.0
									specificity = true_negatives / negatives if negatives else 0.0
									return sensitivity, specificity


								# Run the command-line entry point only when this file is executed as a
								# script, not when it is imported as a module.
								if __name__ == "__main__":

									main()