{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# More Tic-Tac-Toe and a Simple Robot Arm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For this assignment, you will use the reinforcement learning algorithm, Q learning, with a neural network to approximate the Q function. You will apply this to the game Tic-Tac-Toe and to the control of a simple robot arm.\n", "\n", "Most of the code is provided. You are asked to make specific modifications and find parameter values that result in good performance on these tasks. The two tasks will probably require different parameter values." ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "Download necessary code from [ttt_arm.zip](https://www.cs.colostate.edu/~anderson/cs545/notebooks/ttt_arm.zip)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T19:56:38.171397Z", "start_time": "2024-10-24T19:56:37.398272Z" }, "tags": [] }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", " \n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display, clear_output\n", "import pandas as pd\n", "import pickle" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Tic Tac Toe" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T19:56:38.194043Z", "start_time": "2024-10-24T19:56:38.172790Z" }, "tags": [] }, "outputs": [], "source": [ "import tictactoe" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T20:09:04.569252Z", "start_time": "2024-10-24T20:09:04.538820Z" }, "tags": [] }, "outputs": [], "source": [ "class Game:\n", "\n", " def __init__(self, environment, agents):\n", "\n", " self.env = environment\n", " self.agents = agents\n", "\n", " def train(self, parms, verbose=True):\n", "\n", " n_batches = parms['n_batches']\n", " n_games_per_batch = parms['n_games_per_batch']\n", " n_epochs = parms['n_epochs']\n", " method = parms['method']\n", " learning_rate = parms['learning_rate']\n", " epsilon = parms['initial_epsilon']\n", " final_epsilon = parms['final_epsilon']\n", "\n", " ttt = self.env\n", "\n", " epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon)) / (n_batches)) # to produce this final value\n", " \n", " epsilon_trace = []\n", " outcomes = []\n", "\n", " for batch in range(n_batches):\n", " agents['X'].clear_samples()\n", " agents['O'].clear_samples()\n", " \n", " for gamei in range(n_games_per_batch):\n", "\n", " ttt.initialize()\n", " done = False\n", "\n", " while not done:\n", "\n", " agent = agents[ttt.player]\n", " obs = ttt.observe()\n", " if len(self.env.valid_actions()) == 9:\n", " action = np.random.choice(self.env.valid_actions())\n", " # print('picked random action at start of game') \n", " else:\n", " action = agent.epsilon_greedy(epsilon)\n", " # print('picked best action')\n", "\n", " ttt.act(action)\n", " r = ttt.reinforcement()\n", " done = ttt.terminal_state()\n", "\n", " agent.add_sample(obs, action, r, done)\n", "\n", " outcomes.append(r)\n", "\n", " # end n_trials_per_batch\n", " self.agents['X'].train(n_epochs, method, learning_rate)\n", " self.agents['O'].train(n_epochs, method, learning_rate)\n", "\n", " epsilon_trace.append(epsilon)\n", " epsilon *= epsilon_decay\n", "\n", " if verbose and (len(outcomes) % ((n_batches * n_games_per_batch) // 20) == 0):\n", " print(f'{len(outcomes)}/{n_batches * n_games_per_batch} games, {np.mean(outcomes):.2f} outcome mean')\n", "\n", " if verbose:\n", " plt.subplot(3, 1, 1)\n", " n_per = 10\n", " n_bins = len(outcomes) // n_per\n", " outcomes_binned = np.array(outcomes).reshape(-1, n_per)\n", " avgs = outcomes_binned.mean(1)\n", " xs = np.linspace(n_per, n_per * n_bins, len(avgs))\n", " plt.plot(xs, avgs)\n", " plt.axhline(y=0, color='orange', ls='--')\n", " plt.ylabel('R')\n", " \n", " plt.subplot(3, 1, 2)\n", " plt.plot(xs, np.sum(outcomes_binned == -1, axis=1), 'r-', label='O Wins')\n", " plt.plot(xs, np.sum(outcomes_binned == 0, axis=1), 'b-', label='Draws')\n", " plt.plot(xs, np.sum(outcomes_binned == 1, axis=1), 'g-', label='X Wins')\n", " plt.legend(loc='center')\n", " plt.ylabel(f'Number of Games\\nin Bins of {n_per:d}')\n", " \n", " plt.subplot(3, 1, 3)\n", " plt.plot(epsilon_trace)\n", " plt.ylabel(r'$\\epsilon$')\n", "\n", " return outcomes, epsilon_trace\n", "\n", "\n", " def play_game(self, epsilon=0.0, verbose=True):\n", " ttt = self.env\n", " agents = self.agents\n", " ttt.initialize()\n", " while True:\n", " agent = agents[ttt.player]\n", " obs = ttt.observe()\n", " if len(ttt.valid_actions()) == 9:\n", " action = agent.epsilon_greedy(epsilon=1.0)\n", " else:\n", " action = agent.epsilon_greedy(epsilon)\n", "\n", " ttt.act(action)\n", " if verbose:\n", " print(ttt)\n", " if ttt.terminal_state():\n", " return ttt.reinforcement()\n", "\n", " def play_game_show_Q(self, epsilon=0.0):\n", " ttt = self.env\n", " agents = self.agents\n", " step = 0\n", "\n", " ttt.initialize()\n", " while True:\n", " agent = agents[ttt.player]\n", " obs = ttt.observe()\n", " actions = ttt.valid_actions()\n", " if len(ttt.valid_actions()) == 9:\n", " action = agent.epsilon_greedy(epsilon=1.0)\n", " else:\n", " action = agent.epsilon_greedy(epsilon)\n", " ttt.act(action)\n", " step += 1\n", "\n", " plt.subplot(5, 2, step)\n", " Qs = np.array([agent.use(np.hstack((obs, a))) for a in actions])\n", " board_image = np.array([np.nan] * 9)\n", " for Q, a in zip(Qs, actions):\n", " board_image[a] = Q[0, 0]\n", " board_image = board_image.reshape(3, 3)\n", " maxmag = np.nanmax(np.abs(board_image))\n", " plt.imshow(board_image, cmap='coolwarm', vmin=-maxmag, vmax=maxmag)\n", " plt.colorbar()\n", " obs = ttt.observe()\n", " i = -1\n", " for row in range(3):\n", " for col in range(3):\n", " i += 1\n", " if obs[i] == 1:\n", " plt.text(col, row, 'X', ha='center',\n", " fontweight='bold', fontsize='large', color='black')\n", " elif obs[i] == -1:\n", " plt.text(col, row, 'O', ha='center',\n", " fontweight='bold', fontsize='large', color='black')\n", " plt.axis('off')\n", " if ttt.terminal_state():\n", " break\n", "\n", " plt.tight_layout()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T20:09:04.931597Z", "start_time": "2024-10-24T20:09:04.913948Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " X| | \n", " -----\n", " | | \n", " ------\n", " | | \n", "\n", " X| | \n", " -----\n", " O| | \n", " ------\n", " | | \n", "\n", " X|X| \n", " -----\n", " O| | \n", " ------\n", " | | \n", "\n", " X|X| \n", " -----\n", " O|O| \n", " ------\n", " | | \n", "\n", " X|X|X\n", " -----\n", " O|O| \n", " ------\n", " | | \n" ] }, { "data": { "text/plain": [ "1" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ttt = tictactoe.TicTacToe()\n", "nh = [10]\n", "agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'), \n", " 'O': tictactoe.QnetAgent(ttt, nh, 'min')}\n", "game = Game(ttt, agents)\n", "game.play_game(0)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T20:58:21.183792Z", "start_time": "2024-10-24T20:38:08.889535Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " hiddens batches games epochs mean r\n", "43 [50] 500 5 10 0.4244\n", "41 [50, 50, 50] 500 5 5 0.4168\n", "2 [50, 50, 50] 100 5 2 0.3840\n", "65 [50, 50, 50] 500 20 5 0.3734\n", "61 [50] 500 20 2 0.3726\n", ".. ... ... ... ... ...\n", "56 [50, 50, 50] 500 10 10 0.1780\n", "30 [] 100 20 10 0.1750\n", "33 [] 100 20 20 0.1575\n", "15 [] 100 10 5 0.1520\n", "68 [50, 50, 50] 500 20 10 0.1440\n", "\n", "[72 rows x 5 columns]\n" ] } ], "source": [ "previous_best = -np.inf\n", "results = []\n", "for nb in [100, 500]:\n", " for ng in [5, 10, 20]:\n", " for ne in [2, 5, 10, 20]:\n", " for nh in [ [], [50], [50, 50, 50] ]:\n", " parms = {\n", " 'n_batches': nb,\n", " 'n_games_per_batch': ng,\n", " 'n_epochs': ne,\n", " 'method': 'scg',\n", " 'learning_rate': 0.01,\n", " 'initial_epsilon': 1.0,\n", " 'final_epsilon': 0.01,\n", " 'gamma': 1.0\n", " }\n", " agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'), \n", " 'O': tictactoe.QnetAgent(ttt, nh, 'min')}\n", " game = Game(ttt, agents)\n", "\n", " outcomes, _ = game.train(parms, verbose=False)\n", " mean_outcomes = np.mean(outcomes)\n", " results.append([nh, nb, ng, ne, mean_outcomes])\n", " clear_output()\n", " df = pd.DataFrame(results, \n", " columns=('hiddens', 'batches', 'games', \n", " 'epochs', 'mean r'))\n", " print(df.sort_values(by='mean r', ascending=False))\n", " \n", " if mean_outcomes > previous_best:\n", " previous_best = mean_outcomes\n", " with open('best_ttt_agents.pkl', 'wb') as f:\n", " pickle.dump(agents, f)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:11:44.934049Z", "start_time": "2024-10-24T21:11:44.713375Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mean of final outcomes 0.35\n" ] } ], "source": [ "with open('best_ttt_agents.pkl', 'rb') as f:\n", " agents = pickle.load(f)\n", "\n", "ttt = agents['X'].env\n", "game = Game(ttt, agents)\n", "\n", "rs = []\n", "for n_games in range(100):\n", " rs.append(game.play_game(epsilon=0.05, verbose=False))\n", " \n", "print(f'mean of final outcomes {np.mean(rs)}')" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:11:46.330430Z", "start_time": "2024-10-24T21:11:45.985386Z" }, "tags": [] }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "game.play_game_show_Q()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Robot" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:24:11.306077Z", "start_time": "2024-10-24T21:24:10.532760Z" }, "tags": [] }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display, clear_output\n", "import pandas as pd\n", "import pickle\n", "\n", "import robot" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:24:11.484837Z", "start_time": "2024-10-24T21:24:11.475874Z" }, "tags": [] }, "outputs": [], "source": [ " class Experiment:\n", "\n", " def __init__(self, environment, agent):\n", "\n", " self.env = environment\n", " self.agent = agent\n", "\n", " def train(self, parms, verbose=True):\n", "\n", " n_batches = parms['n_batches']\n", " n_steps_per_batch = parms['n_steps_per_batch']\n", " n_epochs = parms['n_epochs']\n", " method = parms['method']\n", " learning_rate = parms['learning_rate']\n", " epsilon = parms['initial_epsilon']\n", " final_epsilon = parms['final_epsilon']\n", " gamma = parms['gamma']\n", "\n", " env = self.env\n", "\n", " epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon))/ (n_batches)) # to produce this final value\n", "\n", " epsilon_trace = []\n", " outcomes = []\n", "\n", " for batch in range(n_batches):\n", " agent.clear_samples()\n", " env.initialize()\n", "\n", " sum_rs = 0\n", " \n", " for step in range(n_steps_per_batch):\n", "\n", " obs = self.env.observe()\n", " action = agent.epsilon_greedy(epsilon)\n", "\n", " env.act(action)\n", " r = env.reinforcement()\n", " sum_rs += r\n", "\n", " done = step == n_steps_per_batch - 1\n", " agent.add_sample(obs, action, r, done)\n", "\n", " outcomes.append(sum_rs / n_steps_per_batch)\n", "\n", " self.agent.train(n_epochs, method, learning_rate, gamma)\n", "\n", " epsilon_trace.append(epsilon)\n", " epsilon *= epsilon_decay\n", "\n", " if verbose and (len(outcomes) % (n_batches // 20) == 0):\n", " print(f'{len(outcomes)}/{n_batches} batches, {np.mean(outcomes):.4f} outcome mean')\n", "\n", " if verbose:\n", " plt.figure(1)\n", " plt.clf()\n", " plt.subplot(2, 1, 1)\n", " n_per = 10\n", " n_bins = len(outcomes) // n_per\n", " outcomes_binned = np.array(outcomes).reshape(-1, n_per)\n", " avgs = outcomes_binned.mean(1)\n", " xs = np.linspace(n_per, n_per * n_bins, len(avgs))\n", " plt.plot(xs, avgs)\n", " plt.axhline(y=0, color='orange', ls='--')\n", " plt.ylabel('R')\n", " \n", " plt.subplot(2, 1, 2)\n", " plt.plot(epsilon_trace)\n", " plt.ylabel(r'$\\epsilon$')\n", " #plt.pause(0.1)\n", "\n", " return outcomes # , epsilon_trace\n", "\n", " def test(self, n_trials, n_steps, epsilon=0.0, graphics=True):\n", " if graphics:\n", " fig = plt.figure(figsize=(10, 10))\n", " robot = self.env\n", " sum_rs = 0\n", " for trial in range(n_trials):\n", " robot.initialize()\n", " agent = self.agent\n", " points = np.zeros((n_steps, robot.n_links + 1, 2))\n", " actions = np.zeros((n_steps, robot.n_links))\n", " Q_values = np.zeros((n_steps))\n", " \n", " for i in range(n_steps):\n", " action = agent.epsilon_greedy(epsilon)\n", " Q = agent.use(np.hstack((robot.observe(), action)))\n", " self.env.act(action)\n", " sum_rs += self.env.reinforcement()\n", " points[i] = robot.points\n", " actions[i] = action\n", " Q_values[i] = Q[0, 0]\n", " \n", " if graphics:\n", " Q_min, Q_max = np.min(Q_values), np.max(Q_values)\n", " print(Q_min, Q_max)\n", " for i in range(n_steps):\n", " fig.clf()\n", " plt.scatter(robot.goal[0], robot.goal[1], s=40, c='blue')\n", " action = actions[i]\n", " robot.set_points(points[i])\n", " robot.draw() # alpha=(Q_values[i] - Q_min) / (Q_max - Q_min))\n", " clear_output(wait=True)\n", " display(fig)\n", " \n", " clear_output(wait=True)\n", " return sum_rs / (n_trials * n_steps)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:25:13.752688Z", "start_time": "2024-10-24T21:25:13.748958Z" }, "tags": [] }, "outputs": [], "source": [ "robbie = robot.Robot()\n", "robbie.set_goal([5., 5.])\n", "\n", "agent = robot.QnetAgent(robbie, [100, 100, 100])\n", "experiment = Experiment(robbie, agent)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:30:14.566512Z", "start_time": "2024-10-24T21:26:42.489755Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " hiddens batches steps epochs dist\n", "13 [50] 200 50 5 -1.814330\n", "17 [50, 50] 200 50 10 -4.569909\n", "1 [50] 100 50 5 -5.063843\n", "16 [50] 200 50 10 -5.805537\n", "5 [50, 50] 100 50 10 -5.866438\n", "4 [50] 100 50 10 -7.114635\n", "18 [] 200 100 5 -7.848883\n", "7 [50] 100 100 5 -7.996590\n", "6 [] 100 100 5 -8.317250\n", "0 [] 100 50 5 -8.321376\n", "3 [] 100 50 10 -8.546975\n", "10 [50] 100 100 10 -8.749536\n", "12 [] 200 50 5 -8.816075\n", "2 [50, 50] 100 50 5 -8.881165\n", "15 [] 200 50 10 -9.246151\n", "9 [] 100 100 10 -9.302054\n", "11 [50, 50] 100 100 10 -10.096282\n", "8 [50, 50] 100 100 5 -10.254745\n", "14 [50, 50] 200 50 5 -10.688630\n", "\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[11], line 23\u001b[0m\n\u001b[1;32m 20\u001b[0m agent \u001b[38;5;241m=\u001b[39m robot\u001b[38;5;241m.\u001b[39mQnetAgent(robbie, nh)\n\u001b[1;32m 21\u001b[0m experiment \u001b[38;5;241m=\u001b[39m Experiment(robbie, agent)\n\u001b[0;32m---> 23\u001b[0m outcomes \u001b[38;5;241m=\u001b[39m experiment\u001b[38;5;241m.\u001b[39mtrain(parms, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 24\u001b[0m results\u001b[38;5;241m.\u001b[39mappend([nh, nb, ns, ne, outcomes[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]])\n\u001b[1;32m 25\u001b[0m clear_output()\n", "Cell \u001b[0;32mIn[2], line 35\u001b[0m, in \u001b[0;36mExperiment.train\u001b[0;34m(self, parms, verbose)\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n_steps_per_batch):\n\u001b[1;32m 34\u001b[0m obs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mobserve()\n\u001b[0;32m---> 35\u001b[0m action \u001b[38;5;241m=\u001b[39m agent\u001b[38;5;241m.\u001b[39mepsilon_greedy(epsilon)\n\u001b[1;32m 37\u001b[0m env\u001b[38;5;241m.\u001b[39mact(action)\n\u001b[1;32m 38\u001b[0m r \u001b[38;5;241m=\u001b[39m env\u001b[38;5;241m.\u001b[39mreinforcement()\n", "File \u001b[0;32m~/public_html/cs545/notebooks/tmp/robot.py:119\u001b[0m, in \u001b[0;36mQnetAgent.epsilon_greedy\u001b[0;34m(self, epsilon)\u001b[0m\n\u001b[1;32m 117\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mshuffle(actions)\n\u001b[1;32m 118\u001b[0m obs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv\u001b[38;5;241m.\u001b[39mobserve()\n\u001b[0;32m--> 119\u001b[0m Qs \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray([\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse(np\u001b[38;5;241m.\u001b[39mhstack((obs, a))) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m actions])\n\u001b[1;32m 120\u001b[0m action \u001b[38;5;241m=\u001b[39m actions[np\u001b[38;5;241m.\u001b[39margmax(Qs)] \u001b[38;5;66;03m# Minimize sum of distances to goal\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m action\n", "File \u001b[0;32m~/public_html/cs545/notebooks/tmp/robot.py:126\u001b[0m, in \u001b[0;36mQnetAgent.use\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 125\u001b[0m X \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mQnet\u001b[38;5;241m.\u001b[39muse(X)\n", "File \u001b[0;32m~/public_html/cs545/notebooks/tmp/neuralnetworksA4.py:410\u001b[0m, in \u001b[0;36mNeuralNetwork.use\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[38;5;66;03m# Standardize X\u001b[39;00m\n\u001b[1;32m 409\u001b[0m X \u001b[38;5;241m=\u001b[39m (X \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_means) \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mX_stds\n\u001b[0;32m--> 410\u001b[0m Zs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward(X)\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# Unstandardize output Y before returning it\u001b[39;00m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Zs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mT_stds \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mT_means\n", "File \u001b[0;32m~/public_html/cs545/notebooks/tmp/neuralnetworksA4.py:301\u001b[0m, in \u001b[0;36mNeuralNetwork._forward\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[38;5;66;03m# Append output of each layer to list in self.Zs, then return it.\u001b[39;00m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m W \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mWs[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]: \u001b[38;5;66;03m# forward through all but last layer\u001b[39;00m\n\u001b[0;32m--> 301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mZs\u001b[38;5;241m.\u001b[39mappend(np\u001b[38;5;241m.\u001b[39mtanh(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_add_ones(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mZs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]) \u001b[38;5;241m@\u001b[39m W))\n\u001b[1;32m 302\u001b[0m last_W \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mWs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mZs\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_add_ones(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mZs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]) \u001b[38;5;241m@\u001b[39m last_W)\n", "File \u001b[0;32m~/public_html/cs545/notebooks/tmp/neuralnetworksA4.py:282\u001b[0m, in \u001b[0;36mNeuralNetwork._add_ones\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_add_ones\u001b[39m(\u001b[38;5;28mself\u001b[39m, X):\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39minsert(X, \u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n", "File \u001b[0;32m~/anaconda3/lib/python3.12/site-packages/numpy/lib/function_base.py:5369\u001b[0m, in \u001b[0;36minsert\u001b[0;34m(arr, obj, values, axis)\u001b[0m\n\u001b[1;32m 5365\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_insert_dispatcher\u001b[39m(arr, obj, values, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 5366\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (arr, obj, values)\n\u001b[0;32m-> 5369\u001b[0m \u001b[38;5;129m@array_function_dispatch\u001b[39m(_insert_dispatcher)\n\u001b[1;32m 5370\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minsert\u001b[39m(arr, obj, values, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 5371\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 5372\u001b[0m \u001b[38;5;124;03m Insert values along the given axis before the given indices.\u001b[39;00m\n\u001b[1;32m 5373\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5456\u001b[0m \n\u001b[1;32m 5457\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 5458\u001b[0m wrap \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "import pandas as pd\n", "\n", "previous_best = -np.inf\n", "\n", "results = []\n", "for nb in [100, 200, 5000]:\n", " for ns in [50, 100]:\n", " for ne in [5, 10]:\n", " for nh in [ [], [50], [50, 50] ]:\n", " parms = {\n", " 'n_batches': nb,\n", " 'n_steps_per_batch': ns,\n", " 'n_epochs': ne,\n", " 'method': 'scg',\n", " 'learning_rate': 0.01,\n", " 'initial_epsilon': 1.0,\n", " 'final_epsilon': 0.001,\n", " 'gamma': 1.0\n", " }\n", " agent = robot.QnetAgent(robbie, nh)\n", " experiment = Experiment(robbie, agent)\n", " \n", " outcomes = experiment.train(parms, verbose=False)\n", " results.append([nh, nb, ns, ne, outcomes[-1]])\n", " clear_output()\n", " df = pd.DataFrame(results, \n", " columns=('hiddens', 'batches', 'steps', \n", " 'epochs', 'dist'))\n", " print(df.sort_values(by='dist', ascending=False))\n", " \n", " if outcomes[-1] > previous_best:\n", " previous_best = outcomes[-1]\n", " with open('best_robot_agent.pkl', 'wb') as f:\n", " pickle.dump(agent, f)\n", " \n", " print()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:30:22.401149Z", "start_time": "2024-10-24T21:30:21.002316Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mean of reinforcements -2.424\n" ] } ], "source": [ "with open('best_robot_agent.pkl', 'rb') as f:\n", " agent = pickle.load(f)\n", "\n", "robbie = agent.env\n", "experiment = Experiment(robbie, agent)\n", "\n", "mean_r = experiment.test(n_trials=10, n_steps=100, epsilon=0.0, graphics=False)\n", "\n", "print(f'mean of reinforcements {mean_r:.3f}')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:31:50.619335Z", "start_time": "2024-10-24T21:31:18.194615Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25/500 batches, -7.9906 outcome mean\n", "50/500 batches, -7.8550 outcome mean\n", "75/500 batches, -7.6936 outcome mean\n", "100/500 batches, -7.4452 outcome mean\n", "125/500 batches, -7.2722 outcome mean\n", "150/500 batches, -7.0897 outcome mean\n", "175/500 batches, -6.8215 outcome mean\n", "200/500 batches, -6.7659 outcome mean\n", "225/500 batches, -6.6292 outcome mean\n", "250/500 batches, -6.4414 outcome mean\n", "275/500 batches, -6.2562 outcome mean\n", "300/500 batches, -6.1209 outcome mean\n", "325/500 batches, -5.9951 outcome mean\n", "350/500 batches, -5.9060 outcome mean\n", "375/500 batches, -5.8354 outcome mean\n", "400/500 batches, -5.7448 outcome mean\n", "425/500 batches, -5.6290 outcome mean\n", "450/500 batches, -5.5753 outcome mean\n", "475/500 batches, -5.5057 outcome mean\n", "500/500 batches, -5.4142 outcome mean\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "parms = {\n", " 'n_batches': 500,\n", " 'n_steps_per_batch': 50, \n", " 'n_epochs': 5,\n", " 'method': 'scg',\n", " 'learning_rate': 0.01,\n", " 'initial_epsilon': 1.0,\n", " 'final_epsilon': 0.0001,\n", " 'gamma': 1.0\n", "}\n", "\n", "agent = robot.QnetAgent(robbie, [50])\n", "experiment = Experiment(robbie, agent)\n", "\n", "outcomes = experiment.train(parms)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2024-10-24T21:32:50.861380Z", "start_time": "2024-10-24T21:32:02.147369Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "-3.4894459784287455" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "experiment.test(10, 100, epsilon=0.0, graphics=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }