My solutions to Harvard's online course CS50AI: Introduction to Artificial Intelligence with Python

import math
import random
import time


class Nim():

    def __init__(self, initial=[1, 3, 5, 7]):
        """
        Initialize game board.
        Each game board has
            - `piles`: a list of how many elements remain in each pile
            - `player`: 0 or 1 to indicate which player's turn
            - `winner`: None, 0, or 1 to indicate who the winner is
        """
        self.piles = initial.copy()
        self.player = 0
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Nim.available_actions(piles) takes a `piles` list as input
        and returns all of the available actions `(i, j)` in that state.

        Action `(i, j)` represents the action of removing `j` items
        from pile `i` (where piles are 0-indexed).
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions
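
    # For example (an illustrative note, not part of the assignment code):
    # Nim.available_actions([1, 2]) returns {(0, 1), (1, 1), (1, 2)}, i.e.
    # take 1 from pile 0, or take 1 or 2 from pile 1.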

    @classmethod
    def other_player(cls, player):
        """
        Nim.other_player(player) returns the player that is not
        `player`. Assumes `player` is either 0 or 1.
        """
        return 0 if player == 1 else 1

    def switch_player(self):
        """
        Switch the current player to the other player.
        """
        self.player = Nim.other_player(self.player)

    def move(self, action):
        """
        Make the move `action` for the current player.
        `action` must be a tuple `(i, j)`.
        """
        pile, count = action

        # Check for errors
        if self.winner is not None:
            raise Exception("Game already won")
        elif pile < 0 or pile >= len(self.piles):
            raise Exception("Invalid pile")
        elif count < 1 or count > self.piles[pile]:
            raise Exception("Invalid number of objects")

        # Update pile
        self.piles[pile] -= count
        self.switch_player()

        # Check for a winner
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player
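

# Example of the game mechanics above (illustrative sketch only): the player
# who removes the last object loses, because `move()` switches players before
# checking for a winner.
#
#   game = Nim(initial=[1])
#   game.move((0, 1))   # player 0 empties the only pile
#   game.winner         # 1, so player 0 (who took the last object) lost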


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.2):
        """
        Initialize AI with an empty Q-learning dictionary,
        an alpha (learning) rate, and an epsilon rate.

        The Q-learning dictionary maps `(state, action)`
        pairs to a Q-value (a number).
            - `state` is a tuple of remaining piles, e.g. (1, 1, 4, 4)
            - `action` is a tuple `(i, j)` for an action
        """
        self.q = dict()
        self.alpha = alpha
        self.epsilon = epsilon

    def update(self, old_state, action, new_state, reward):
        """
        Update Q-learning model, given an old state, an action taken
        in that state, a new resulting state, and the reward received
        from taking that action.
        """
        old = self.get_q_value(old_state, action)
        best_future = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old, reward, best_future)

    def get_q_value(self, state, action):
        """
        Return the Q-value for the state `state` and the action `action`.
        If no Q-value exists yet in `self.q`, return 0.
        """
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_rewards):
        """
        Update the Q-value for the state `state` and the action `action`
        given the previous Q-value `old_q`, a current reward `reward`,
        and an estimate of future rewards `future_rewards`.

        Use the formula:

        Q(s, a) <- old value estimate
                   + alpha * (new value estimate - old value estimate)

        where `old value estimate` is the previous Q-value,
        `alpha` is the learning rate, and `new value estimate`
        is the sum of the current reward and estimated future rewards.
        """
        self.q[(tuple(state), action)] = old_q + self.alpha * (reward + future_rewards - old_q)
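
    # Worked example of the update formula (hypothetical numbers): with
    # alpha = 0.5, old_q = 0.2, reward = 0 and future_rewards = 1, the new
    # value estimate is 0 + 1 = 1, so the stored Q-value becomes
    # 0.2 + 0.5 * (1 - 0.2) = 0.6.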

    def best_future_reward(self, state):
        """
        Given a state `state`, consider all possible `(state, action)`
        pairs available in that state and return the maximum of all
        of their Q-values.

        Use 0 as the Q-value if a `(state, action)` pair has no
        Q-value in `self.q`. If there are no available actions in
        `state`, return 0.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0

        # `get_q_value` already returns 0 for unseen pairs, so taking the
        # maximum over every available action matches the specification
        return max(self.get_q_value(state, action) for action in actions)
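
    # For instance (illustrative numbers only): if state (0, 0, 0, 2) has
    # stored Q-values 0.5 for action (3, 1) and -1 for action (3, 2),
    # best_future_reward returns 0.5; if neither pair has been seen yet,
    # it returns 0.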

    def choose_action(self, state, epsilon=True):
        """
        Given a state `state`, return an action `(i, j)` to take.

        If `epsilon` is `False`, then return the best action
        available in the state (the one with the highest Q-value,
        using 0 for pairs that have no Q-values).

        If `epsilon` is `True`, then with probability
        `self.epsilon` choose a random available action,
        otherwise choose the best action available.

        If multiple actions have the same Q-value, any of those
        options is an acceptable return value.
        """
        actions = list(Nim.available_actions(state))

        # Explore: with probability `self.epsilon`, pick a random action
        if epsilon and random.random() < self.epsilon:
            return random.choice(actions)

        # Exploit: pick the action with the highest Q-value, treating
        # unseen `(state, action)` pairs as 0
        return max(actions, key=lambda action: self.get_q_value(state, action))
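
    # Exploration vs. exploitation in practice: during training,
    # choose_action(game.piles) explores with probability self.epsilon
    # (0.2 by default) and otherwise plays the highest-Q action; play()
    # below calls choose_action(game.piles, epsilon=False), so the trained
    # AI always plays greedily against a human.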


def train(n):
    """
    Train an AI by playing `n` games against itself.
    """

    player = NimAI()

    # Play n games
    for i in range(n):
        print(f"Playing training game {i + 1}")
        game = Nim()

        # Keep track of last move made by either player
        last = {
            0: {"state": None, "action": None},
            1: {"state": None, "action": None}
        }

        # Game loop
        while True:

            # Keep track of current state and action
            state = game.piles.copy()
            action = player.choose_action(game.piles)

            # Keep track of last state and action
            last[game.player]["state"] = state
            last[game.player]["action"] = action

            # Make move
            game.move(action)
            new_state = game.piles.copy()

            # When game is over, update Q values with rewards
            if game.winner is not None:
                player.update(state, action, new_state, -1)
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    1
                )
                break

            # If game is continuing, no rewards yet
            elif last[game.player]["state"] is not None:
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    0
                )

    print("Done training")

    # Return the trained AI
    return player


def play(ai, human_player=None):
    """
    Play human game against the AI.
    `human_player` can be set to 0 or 1 to specify whether
    human player moves first or second.
    """

    # If no player order set, choose human's order randomly
    if human_player is None:
        human_player = random.randint(0, 1)

    # Create new game
    game = Nim()

    # Game loop
    while True:

        # Print contents of piles
        print()
        print("Piles:")
        for i, pile in enumerate(game.piles):
            print(f"Pile {i}: {pile}")
        print()

        # Compute available actions
        available_actions = Nim.available_actions(game.piles)
        time.sleep(1)

        # Let human make a move
        if game.player == human_player:
            print("Your Turn")
            while True:
                pile = int(input("Choose Pile: "))
                count = int(input("Choose Count: "))
                if (pile, count) in available_actions:
                    break
                print("Invalid move, try again.")

        # Have AI make a move
        else:
            print("AI's Turn")
            pile, count = ai.choose_action(game.piles, epsilon=False)
            print(f"AI chose to take {count} from pile {pile}.")

        # Make move
        game.move((pile, count))

        # Check for winner
        if game.winner is not None:
            print()
            print("GAME OVER")
            winner = "Human" if game.winner == human_player else "AI"
            print(f"Winner is {winner}")
            return
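

# A minimal entry point for running this file directly (a sketch, not part of
# the original solution; the count of 10,000 training games is an arbitrary
# choice):
if __name__ == "__main__":
    trained_ai = train(10000)
    play(trained_ai)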