"""Predict whether an online-shopping session ends in a purchase.

Loads session data from a CSV file, trains a nearest-neighbour classifier
on part of it, and reports sensitivity and specificity on the rest.
"""

import csv
import sys

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def passv(x):
    """Return `x` unchanged (identity converter; not used by this module)."""
    return x


MONTHS = ['Jan', 'Feb', 'Mar', 'April', 'May', 'June',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Months are matched on their first three letters so that both abbreviated
# ("Jun", "Apr") and longer ("June", "April") spellings resolve.
_MONTH_INDEX = {name[:3].lower(): index for index, name in enumerate(MONTHS)}

TEST_SIZE = 0.4


def _month_index(name):
    """Return the 0-based month index for a month name or abbreviation.

    Raises ValueError for an unrecognised month, like `list.index` would.
    """
    try:
        return _MONTH_INDEX[name.strip()[:3].lower()]
    except KeyError:
        raise ValueError(f"unknown month: {name!r}") from None


def _is_returning_visitor(value):
    """Return 1 if `value` is exactly "Returning_Visitor", else 0."""
    return 1 if value == "Returning_Visitor" else 0


def _is_true(value):
    """Return 0 if `value` is exactly "FALSE", else 1."""
    return 1 if value != "FALSE" else 0


# One converter per evidence column, in CSV column order.
CONVERSION_FUNCTIONS = [
    int,                    # Administrative
    float,                  # Administrative_Duration
    int,                    # Informational
    float,                  # Informational_Duration
    int,                    # ProductRelated
    float,                  # ProductRelated_Duration
    float,                  # BounceRates
    float,                  # ExitRates
    float,                  # PageValues
    float,                  # SpecialDay
    _month_index,           # Month
    int,                    # OperatingSystems
    int,                    # Browser
    int,                    # Region
    int,                    # TrafficType
    _is_returning_visitor,  # VisitorType
    _is_true,               # Weekend
]


def main():
    """Train and evaluate the classifier on the CSV file named in argv[1]."""
    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python shopping.py data")

    # Load data from spreadsheet and split into train and test sets
    evidence, labels = load_data(sys.argv[1])
    X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=TEST_SIZE
    )

    # Train model and make predictions
    model = train_model(X_train, y_train)
    predictions = model.predict(X_test)
    sensitivity, specificity = evaluate(y_test, predictions)

    # Print results
    print(f"Correct: {(y_test == predictions).sum()}")
    print(f"Incorrect: {(y_test != predictions).sum()}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


def load_data(filename):
    """
    Load shopping data from a CSV file `filename` and convert into a list of
    evidence lists and a list of labels. Return a tuple (evidence, labels).

    evidence should be a list of lists, where each list contains the
    following values, in order:
        - Administrative, an integer
        - Administrative_Duration, a floating point number
        - Informational, an integer
        - Informational_Duration, a floating point number
        - ProductRelated, an integer
        - ProductRelated_Duration, a floating point number
        - BounceRates, a floating point number
        - ExitRates, a floating point number
        - PageValues, a floating point number
        - SpecialDay, a floating point number
        - Month, an index from 0 (January) to 11 (December)
        - OperatingSystems, an integer
        - Browser, an integer
        - Region, an integer
        - TrafficType, an integer
        - VisitorType, an integer 0 (not returning) or 1 (returning)
        - Weekend, an integer 0 (if false) or 1 (if true)

    labels should be the corresponding list of labels, where each label
    is 1 if Revenue is true, and 0 otherwise.

    The first row of the file is treated as a header and skipped; blank
    rows are ignored. Raises ValueError if a data row does not have exactly
    one value per evidence column plus the label, or if a value cannot be
    converted.
    """
    evidence = []
    labels = []

    # newline="" lets the csv module do its own newline handling.
    with open(filename, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines

            *fields, revenue = row
            if len(fields) != len(CONVERSION_FUNCTIONS):
                raise ValueError(
                    f"{filename}, line {reader.line_num}: expected "
                    f"{len(CONVERSION_FUNCTIONS) + 1} columns, got {len(row)}"
                )

            evidence.append(
                [convert(value)
                 for convert, value in zip(CONVERSION_FUNCTIONS, fields)]
            )
            labels.append(_is_true(revenue))

    return evidence, labels


def train_model(evidence, labels):
    """
    Given a list of evidence lists and a list of labels, return a
    fitted k-nearest neighbor model (k=1) trained on the data.
    """
    # k=1 as documented; the previous code used 10 neighbours.
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(evidence, labels)
    return model


def evaluate(labels, predictions):
    """
    Given a list of actual labels and a list of predicted labels,
    return a tuple (sensitivity, specificity).

    Assume each label is either a 1 (positive) or 0 (negative).

    `sensitivity` should be a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` should be a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.

    If there are no actual positives (or no actual negatives), the
    corresponding rate is undefined and is reported as 0.0 instead of
    raising ZeroDivisionError.
    """
    true_positives = 0
    true_negatives = 0
    positives = 0
    negatives = 0

    for actual, predicted in zip(labels, predictions):
        if actual == 0:
            negatives += 1
            if predicted == actual:
                true_negatives += 1
        else:
            positives += 1
            if predicted == actual:
                true_positives += 1

    sensitivity = true_positives / positives if positives else 0.0
    specificity = true_negatives / negatives if negatives else 0.0
    return sensitivity, specificity


if __name__ == "__main__":
    main()