My solutions to Harvard's online course CS50AI, An Introduction to Machine Learning
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

128 lines
3.8 KiB

import csv
import sys
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
passv = lambda x: x
MONTHS = ['Jan', 'Feb', 'Mar', 'April', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
TEST_SIZE = 0.4
CONVERSION_FUNCTIONS = [int, float, int, float, int, float, float, float,
float, float, MONTHS.index , int, int, int, int, lambda x: 1 if x == "Returning_Visitor" else 0,
lambda x: 1 if x!="FALSE" else 0]
def main():
# Check command-line arguments
if len(sys.argv) != 2:
sys.exit("Usage: python shopping.py data")
# Load data from spreadsheet and split into train and test sets
evidence, labels = load_data(sys.argv[1])
X_train, X_test, y_train, y_test = train_test_split(
evidence, labels, test_size=TEST_SIZE
)
# Train model and make predictions
model = train_model(X_train, y_train)
predictions = model.predict(X_test)
sensitivity, specificity = evaluate(y_test, predictions)
# Print results
print(f"Correct: {(y_test == predictions).sum()}")
print(f"Incorrect: {(y_test != predictions).sum()}")
print(f"True Positive Rate: {100 * sensitivity:.2f}%")
print(f"True Negative Rate: {100 * specificity:.2f}%")
def load_data(filename):
"""
Load shopping data from a CSV file `filename` and convert into a list of
evidence lists and a list of labels. Return a tuple (evidence, labels).
evidence should be a list of lists, where each list contains the
following values, in order:
- Administrative, an integer
- Administrative_Duration, a floating point number
- Informational, an integer
- Informational_Duration, a floating point number
- ProductRelated, an integer
- ProductRelated_Duration, a floating point number
- BounceRates, a floating point number
- ExitRates, a floating point number
- PageValues, a floating point number
- SpecialDay, a floating point number
- Month, an index from 0 (January) to 11 (December)
- OperatingSystems, an integer
- Browser, an integer
- Region, an integer
- TrafficType, an integer
- VisitorType, an integer 0 (not returning) or 1 (returning)
- Weekend, an integer 0 (if false) or 1 (if true)
labels should be the corresponding list of labels, where each label
is 1 if Revenue is true, and 0 otherwise.
"""
data = []
labels = []
with open("shopping.csv", "r") as f:
reader = csv.reader(f)
first = True
for line in reader:
if first:
first = False
continue
row = list(line)
data.append([None] * (len(row) - 1))
for i in range(len(row) - 1):
data[-1][i] = CONVERSION_FUNCTIONS[i](row[i])
labels.append(1 if row[-1]!="FALSE" else 0)
return data, labels
def train_model(evidence, labels):
"""
Given a list of evidence lists and a list of labels, return a
fitted k-nearest neighbor model (k=1) trained on the data.
"""
model = KNeighborsClassifier(n_neighbors=10)
model.fit(evidence,labels)
return model
def evaluate(labels, predictions):
"""
Given a list of actual labels and a list of predicted labels,
return a tuple (sensitivity, specificty).
Assume each label is either a 1 (positive) or 0 (negative).
`sensitivity` should be a floating-point value from 0 to 1
representing the "true positive rate": the proportion of
actual positive labels that were accurately identified.
`specificity` should be a floating-point value from 0 to 1
representing the "true negative rate": the proportion of
actual negative labels that were accurately identified.
"""
true_n = 0
true_p = 0
negatives = 0
positives = 0
for i, j in zip(labels, predictions):
if i == 0:
if i == j:
true_n += 1
negatives += 1
else:
if i == j:
true_p += 1
positives += 1
specificity = true_n / negatives
sensitivity = true_p / positives
return sensitivity, specificity
if __name__ == "__main__":
main()