import random
import time


class Nim():

    def __init__(self, initial=[1, 3, 5, 7]):
        """
        Initialize game board.
        Each game board has
            - `piles`: a list of how many elements remain in each pile
            - `player`: 0 or 1 to indicate which player's turn
            - `winner`: None, 0, or 1 to indicate who the winner is
        """
        self.piles = initial.copy()
        self.player = 0
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Nim.available_actions(piles) takes a `piles` list as input
        and returns all of the available actions `(i, j)` in that state.

        Action `(i, j)` represents the action of removing `j` items
        from pile `i` (where piles are 0-indexed).
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions

    @classmethod
    def other_player(cls, player):
        """
        Nim.other_player(player) returns the player that is not
        `player`. Assumes `player` is either 0 or 1.
        """
        return 0 if player == 1 else 1

    def switch_player(self):
        """
        Switch the current player to the other player.
        """
        self.player = Nim.other_player(self.player)

    def move(self, action):
        """
        Make the move `action` for the current player.
        `action` must be a tuple `(i, j)`.
        """
        pile, count = action

        # Check for errors
        if self.winner is not None:
            raise Exception("Game already won")
        elif pile < 0 or pile >= len(self.piles):
            raise Exception("Invalid pile")
        elif count < 1 or count > self.piles[pile]:
            raise Exception("Invalid number of objects")

        # Update pile
        self.piles[pile] -= count
        self.switch_player()

        # Check for a winner
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.2):
        """
        Initialize AI with an empty Q-learning dictionary,
        an alpha (learning) rate, and an epsilon rate.

        The Q-learning dictionary maps `(state, action)`
        pairs to a Q-value (a number).
            - `state` is a tuple of remaining piles, e.g. (1, 1, 4, 4)
            - `action` is a tuple `(i, j)` for an action
        """
        self.q = dict()
        self.alpha = alpha
        self.epsilon = epsilon

    def update(self, old_state, action, new_state, reward):
        """
        Update Q-learning model, given an old state, an action taken
        in that state, a new resulting state, and the reward received
        from taking that action.
        """
        old = self.get_q_value(old_state, action)
        best_future = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old, reward, best_future)

    def get_q_value(self, state, action):
        """
        Return the Q-value for the state `state` and the action `action`.
        If no Q-value exists yet in `self.q`, return 0.
        """
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_rewards):
        """
        Update the Q-value for the state `state` and the action `action`
        given the previous Q-value `old_q`, a current reward `reward`,
        and an estimate of future rewards `future_rewards`.

        Use the formula:

        Q(s, a) <- old value estimate
                   + alpha * (new value estimate - old value estimate)

        where `old value estimate` is the previous Q-value,
        `alpha` is the learning rate, and `new value estimate`
        is the sum of the current reward and estimated future rewards.
        """
        self.q[(tuple(state), action)] = (
            old_q + self.alpha * (reward + future_rewards - old_q)
        )

    def best_future_reward(self, state):
        """
        Given a state `state`, consider all possible `(state, action)`
        pairs available in that state and return the maximum of all
        of their Q-values.

        Use 0 as the Q-value if a `(state, action)` pair has no
        Q-value in `self.q`. If there are no available actions in
        `state`, return 0.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0
        # Pairs with no stored Q-value default to 0 via get_q_value
        return max(self.get_q_value(state, action) for action in actions)
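
    # Worked illustration of best_future_reward (assumed values): if self.q
    # holds Q-values 0.5 for ((1, 1, 4, 4), (3, 4)) and -0.25 for
    # ((1, 1, 4, 4), (0, 1)), then best_future_reward((1, 1, 4, 4)) returns
    # 0.5, with every unexplored action contributing 0 to the max.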

    def choose_action(self, state, epsilon=True):
        """
        Given a state `state`, return an action `(i, j)` to take.

        If `epsilon` is `False`, then return the best action
        available in the state (the one with the highest Q-value,
        using 0 for pairs that have no Q-values).

        If `epsilon` is `True`, then with probability
        `self.epsilon` choose a random available action,
        otherwise choose the best action available.

        If multiple actions have the same Q-value, any of those
        options is an acceptable return value.
        """
        actions = list(Nim.available_actions(state))

        # With probability self.epsilon, explore: take a random action
        if epsilon and random.random() < self.epsilon:
            return random.choice(actions)

        # Otherwise act greedily: take the action with the highest Q-value,
        # where actions with no stored Q-value default to 0
        return max(actions, key=lambda action: self.get_q_value(state, action))


def train(n):
    """
    Train an AI by playing `n` games against itself.
    """

    player = NimAI()

    # Play n games
    for i in range(n):
        print(f"Playing training game {i + 1}")
        game = Nim()

        # Keep track of last move made by either player
        last = {
            0: {"state": None, "action": None},
            1: {"state": None, "action": None}
        }

        # Game loop
        while True:

            # Keep track of current state and action
            state = game.piles.copy()
            action = player.choose_action(game.piles)

            # Keep track of last state and action
            last[game.player]["state"] = state
            last[game.player]["action"] = action

            # Make move
            game.move(action)
            new_state = game.piles.copy()

            # When game is over, update Q values with rewards
            if game.winner is not None:
                player.update(state, action, new_state, -1)
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    1
                )
                break

            # If game is continuing, no rewards yet
            elif last[game.player]["state"] is not None:
                player.update(
                    last[game.player]["state"],
                    last[game.player]["action"],
                    new_state,
                    0
                )

    print("Done training")

    # Return the trained AI
    return player


def play(ai, human_player=None):
    """
    Play a human game against the AI.
    `human_player` can be set to 0 or 1 to specify whether
    the human player moves first or second.
    """

    # If no player order set, choose human's order randomly
    if human_player is None:
        human_player = random.randint(0, 1)

    # Create new game
    game = Nim()

    # Game loop
    while True:

        # Print contents of piles
        print()
        print("Piles:")
        for i, pile in enumerate(game.piles):
            print(f"Pile {i}: {pile}")
        print()

        # Compute available actions
        available_actions = Nim.available_actions(game.piles)
        time.sleep(1)

        # Let human make a move
        if game.player == human_player:
            print("Your Turn")
            while True:
                try:
                    pile = int(input("Choose Pile: "))
                    count = int(input("Choose Count: "))
                except ValueError:
                    print("Invalid input, try again.")
                    continue
                if (pile, count) in available_actions:
                    break
                print("Invalid move, try again.")

        # Have AI make a move
        else:
            print("AI's Turn")
            pile, count = ai.choose_action(game.piles, epsilon=False)
            print(f"AI chose to take {count} from pile {pile}.")

        # Make move
        game.move((pile, count))

        # Check for winner
        if game.winner is not None:
            print()
            print("GAME OVER")
            winner = "Human" if game.winner == human_player else "AI"
            print(f"Winner is {winner}")
            return
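

# Example usage: train the AI through self-play, then play against it
# interactively. The 10000-game count is an illustrative choice, not a
# tuned value; more games generally yield a stronger opponent at the
# cost of training time.
if __name__ == "__main__":
    ai = train(10000)
    play(ai)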