import math
import random
import time


class Nim():

    def __init__(self, initial=[1, 3, 5, 7]):
        """
        Initialize game board.
        Each game board has
            - `piles`: a list of how many elements remain in each pile
            - `player`: 0 or 1 to indicate which player's turn
            - `winner`: None, 0, or 1 to indicate who the winner is
        """
        self.piles = initial.copy()
        self.player = 0
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Nim.available_actions(piles) takes a `piles` list as input
        and returns all of the available actions `(i, j)` in that state.

        Action `(i, j)` represents the action of removing `j` items
        from pile `i` (where piles are 0-indexed).
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions
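        # For example (illustrative): Nim.available_actions([1, 2]) returns
        # {(0, 1), (1, 1), (1, 2)}, i.e. take 1 from pile 0, or 1 or 2 from pile 1.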

    @classmethod
    def other_player(cls, player):
        """
        Nim.other_player(player) returns the player that is not
        `player`. Assumes `player` is either 0 or 1.
        """
        return 0 if player == 1 else 1

    def switch_player(self):
        """
        Switch the current player to the other player.
        """
        self.player = Nim.other_player(self.player)

    def move(self, action):
        """
        Make the move `action` for the current player.
        `action` must be a tuple `(i, j)`.
        """
        pile, count = action

        # Check for errors
        if self.winner is not None:
            raise Exception("Game already won")
        elif pile < 0 or pile >= len(self.piles):
            raise Exception("Invalid pile")
        elif count < 1 or count > self.piles[pile]:
            raise Exception("Invalid number of objects")

        # Update pile
        self.piles[pile] -= count
        self.switch_player()

        # Check for a winner
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player
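            # Note: switch_player() has already run, so the winner recorded here
            # is the player who did NOT remove the last object; taking the last
            # object loses in this variant of Nim.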


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.2):
        """
        Initialize AI with an empty Q-learning dictionary,
        an alpha (learning) rate, and an epsilon rate.

        The Q-learning dictionary maps `(state, action)`
        pairs to a Q-value (a number).
         - `state` is a tuple of remaining piles, e.g. (1, 1, 4, 4)
         - `action` is a tuple `(i, j)` for an action
        """
        self.q = dict()
        self.alpha = alpha
        self.epsilon = epsilon
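        # For example (illustrative), self.q[((1, 1, 4, 4), (3, 2))] would hold
        # the learned value of taking 2 objects from pile 3 in state (1, 1, 4, 4).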

    def update(self, old_state, action, new_state, reward):
        """
        Update Q-learning model, given an old state, an action taken
        in that state, a new resulting state, and the reward received
        from taking that action.
        """
        old = self.get_q_value(old_state, action)
        best_future = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old, reward, best_future)

    def get_q_value(self, state, action):
        """
        Return the Q-value for the state `state` and the action `action`.
        If no Q-value exists yet in `self.q`, return 0.
        """
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_rewards):
        """
        Update the Q-value for the state `state` and the action `action`
        given the previous Q-value `old_q`, a current reward `reward`,
        and an estimate of future rewards `future_rewards`.

        Use the formula:

            Q(s, a) <- old value estimate
                       + alpha * (new value estimate - old value estimate)

        where `old value estimate` is the previous Q-value,
        `alpha` is the learning rate, and `new value estimate`
        is the sum of the current reward and estimated future rewards.
        """
        self.q[(tuple(state), action)] = old_q + self.alpha * (reward + future_rewards - old_q)
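        # Worked example (illustrative): with alpha = 0.5, old_q = 0, reward = 1
        # and future_rewards = 0, the new value is 0 + 0.5 * (1 + 0 - 0) = 0.5.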

    def best_future_reward(self, state):
        """
        Given a state `state`, consider all possible `(state, action)`
        pairs available in that state and return the maximum of all
        of their Q-values.

        Use 0 as the Q-value if a `(state, action)` pair has no
        Q-value in `self.q`. If there are no available actions in
        `state`, return 0.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0
        return max(self.get_q_value(state, action) for action in actions)
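        # A terminal state (all piles empty) has no available actions, so the
        # estimated future reward there is 0.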

    def choose_action(self, state, epsilon=True):
        """
        Given a state `state`, return an action `(i, j)` to take.

        If `epsilon` is `False`, then return the best action
        available in the state (the one with the highest Q-value,
        using 0 for pairs that have no Q-values).

        If `epsilon` is `True`, then with probability
        `self.epsilon` choose a random available action,
        otherwise choose the best action available.

        If multiple actions have the same Q-value, any of those
        options is an acceptable return value.
        """
        actions = list(Nim.available_actions(state))

        # With probability self.epsilon, explore: choose a random available action
        if epsilon and random.random() < self.epsilon:
            return random.choice(actions)

        # Otherwise exploit: choose the action with the highest Q-value,
        # treating unseen (state, action) pairs as 0 and breaking ties arbitrarily
        return max(actions, key=lambda action: self.get_q_value(state, action))
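        # With the default epsilon of 0.2, roughly one training move in five is
        # random exploration; play() calls choose_action with epsilon=False so
        # the AI always exploits its learned values against a human.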


def train(n):
    """
    Train an AI by playing `n` games against itself.
    """

    player = NimAI()

    # Play n games
    for i in range(n):
        print(f"Playing training game {i + 1}")
        game = Nim()

        # Keep track of last move made by either player
        last = {
            0: {"state": None, "action": None},
            1: {"state": None, "action": None}
        }

        # Game loop
        while True:

            # Keep track of current state and action
            state = game.piles.copy()
            action = player.choose_action(game.piles)

            # Keep track of last state and action
            last[game.player]["state"] = state
            last[game.player]["action"] = action

            # Make move
            game.move(action)
            new_state = game.piles.copy()

            # When game is over, update Q values with rewards
            if game.winner is not None:
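                # The player who just emptied the last pile has lost, so their
                # final move is penalized (-1) and the other player's previous
                # move is rewarded (+1).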
                player.update(state, action, new_state, -1)
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    1
                )
                break

            # If game is continuing, no rewards yet
            elif last[game.player]["state"] is not None:
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    0
                )

    print("Done training")

    # Return the trained AI
    return player


def play(ai, human_player=None):
    """
    Play a human game against the AI.
    `human_player` can be set to 0 or 1 to specify whether
    the human player moves first or second.
    """

    # If no player order set, choose human's order randomly
    if human_player is None:
        human_player = random.randint(0, 1)

    # Create new game
    game = Nim()

    # Game loop
    while True:

        # Print contents of piles
        print()
        print("Piles:")
        for i, pile in enumerate(game.piles):
            print(f"Pile {i}: {pile}")
        print()

        # Compute available actions
        available_actions = Nim.available_actions(game.piles)
        time.sleep(1)

        # Let human make a move
        if game.player == human_player:
            print("Your Turn")
            while True:
                try:
                    pile = int(input("Choose Pile: "))
                    count = int(input("Choose Count: "))
                except ValueError:
                    print("Invalid move, try again.")
                    continue
                if (pile, count) in available_actions:
                    break
                print("Invalid move, try again.")

        # Have AI make a move
        else:
            print("AI's Turn")
            pile, count = ai.choose_action(game.piles, epsilon=False)
            print(f"AI chose to take {count} from pile {pile}.")

        # Make move
        game.move((pile, count))

        # Check for winner
        if game.winner is not None:
            print()
            print("GAME OVER")
            winner = "Human" if game.winner == human_player else "AI"
            print(f"Winner is {winner}")
            return
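

# Example usage (an illustrative sketch, not part of the original module):
# train a Q-learning agent through self-play, then play against it
# interactively. The number of training games here is an arbitrary choice.
if __name__ == "__main__":
    ai = train(10000)
    play(ai)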