Lab files for Windows 10
BIN
CM3020 Artificial Intelligence/Week 4/4.13 DQN Flowchart A.pdf
Normal file
Binary file not shown.
@@ -0,0 +1,270 @@
## This entire file has been adapted from code
## by Jacob Chapman and Mathias Lechner, available here as of
## 11/11/2021
## https://github.com/keras-team/keras-io/blob/master/examples/rl/deep_q_network_breakout.py
## Changes made:
## * use the breakwall version of breakout instead of atari
## * log in a tensorboard-compatible format and print logs
## * save the model weights each time the moving episodic reward reaches a new max

from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import datetime

# logging code
# for tensorboard
# https://www.tensorflow.org/tensorboard/get_started
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
# train_summary_writer = tf.summary.create_file_writer(train_log_dir)
# end of logging code
# for tensorboard

# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Rate at which to reduce chance of random action being taken
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 10000

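# Added worked example (a sketch, not part of the original lab code): with the
# schedule used further down, epsilon is reduced by
#   epsilon_interval / epsilon_greedy_frames = (1.0 - 0.1) / 1,000,000 = 9e-7
# per frame once the purely random phase is over, so it takes roughly one
# million frames for epsilon to fall from 1.0 to its floor of 0.1.
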
#env_name = "BreakoutNoFrameskip-v4" # for windows?
env_name = "gym_gs:BreakwallNoFrameskip-v1"
#env_name2 = "BreakoutNoFrameskip-v4" # for windows?
env_name2 = "gym_gs_BreakwallNoFrameskip-v1"

# Use the Baseline Atari environment because of Deepmind helper functions
env = make_atari(env_name)

# Wrap the frames, grey-scale, stack four frames and scale to smaller ratio
env = wrap_deepmind(env, frame_stack=True, scale=True)
env.seed(seed)

"""
|
||||
## Implement the Deep Q-Network
|
||||
|
||||
This network learns an approximation of the Q-table, which is a mapping between
|
||||
the states ainnd actions that an agent will take. For every state we'll have four
|
||||
actions, that can be taken. The environment provides the state, and the action
|
||||
is chosen by selecting the larger of the four Q-values predicted in the output layer.
|
||||
"""
|
||||
|
||||
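# Illustrative example (added comment, not in the original lab code): if the
# network predicts Q-values [0.05, 1.20, 0.30, 0.70] for the current state,
# tf.argmax picks index 1, so action 1 is taken.
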
num_actions = 4


def log(running_reward, last_reward, episode, mem_perc, epsilon, frame, tensorboard_log=False):
    """
    log the running episodic reward, most recent reward,
    episode count, epsilon value and frame count, plus mem_perc which
    is the percentage of the action memory that is full
    """
    if tensorboard_log:
        with train_summary_writer.as_default():
            tf.summary.scalar('running reward', running_reward, step=episode)

    template = 'Epoch,{}, Mem,{}%, Eps,{}, Frame,{}, Last reward:,{}, Running reward:,{}, '
    print(template.format(episode+1,
                          np.round(mem_perc, 3),
                          np.round(epsilon, 3),
                          frame,
                          last_reward,
                          running_reward))

def save_weights(env_name2, model, episode, run_id):
    """
    save the weights of the sent model,
    with the env_name, episode and run_id used
    to generate the filename
    """
    print("Saving weights")
    path = "./saves/"
    model.save_weights(path + env_name2 + "_" + str(run_id) + "_" + str(episode))
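# Added note (not in the original lab code): the call sites below pass
# current_time as run_id, so checkpoints are written under ./saves/ with
# names like gym_gs_BreakwallNoFrameskip-v1_<YYYYmmdd-HHMMSS>_<episode>.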


def create_q_model():
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=(84, 84, 4,))
    # Convolutions on the frames on the screen
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)
    return keras.Model(inputs=inputs, outputs=action)

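# Added note (not in the original lab code): the (84, 84, 4) input shape
# matches what wrap_deepmind produces above - four stacked 84x84 grey-scale
# frames scaled to the [0, 1] range.
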
# The first model makes the predictions for Q-values which are used to
# take an action.
model = create_q_model()
model.summary()
# Build a target model for the prediction of future rewards.
# The weights of the target model only get updated every 10000 steps, so that
# the target Q-value is stable when the loss between the Q-values is calculated.
model_target = create_q_model()

"""
|
||||
## Train
|
||||
"""
|
||||
# In the Deepmind paper they use RMSProp however then Adam optimizer
|
||||
# improves training time
|
||||
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
|
||||
|
||||
# Experience replay buffers
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
# Number of frames to take random action and observe output
epsilon_random_frames = 50000
# Number of frames for exploration
epsilon_greedy_frames = 1000000.0
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000
# Train the model after 4 actions
update_after_actions = 4
# How often to update the target network
update_target_network = 10000
# Using huber loss for stability
loss_function = keras.losses.Huber()
epoch = 0
# use this to decide when it is time to save the weights
max_reward = 0

while True:  # Run until solved
    state = np.array(env.reset())
    episode_reward = 0
    print("Starting episode", episode_count, "played frames", frame_count)
    for timestep in range(1, max_steps_per_episode):  # 10000
        # env.render()  # Adding this line would show the attempts
        # of the agent in a pop up window.
        if frame_count % 250 == 0:
            log(np.mean(episode_reward_history),
                episode_reward,
                episode_count,
                len(rewards_history) / max_memory_length * 100,
                epsilon,
                frame_count)

        frame_count += 1
        # Use epsilon-greedy for exploration
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict action Q-values
            # From environment state
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Decay probability of taking random action
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment
        state_next, reward, done, _ = env.step(action)
        state_next = np.array(state_next)

        # print("state shape:", state.shape)
        episode_reward += reward

        # Save actions and states in replay buffer
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update every fourth frame and once batch size is over 32
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # print("Training neural network")
            # Get indices of samples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Build the updated Q-values for the sampled future states
            # Use the target model for stability
            future_rewards = model_target.predict(state_next_sample)
            # Q value = reward + discount factor * expected future reward
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )

            # If final frame set the last value to -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

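            # Added worked example (not in the original lab code): the target is
            # the Bellman estimate r + gamma * max_a' Q_target(s', a'). For a
            # sample with reward 1.0 and a best future Q-value of 2.0 this gives
            # 1.0 + 0.99 * 2.0 = 2.98; for a terminal sample the line above
            # replaces the target with -1 via the (1 - done) ... - done trick.
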
            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

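            # Illustrative example (added comment): if a sampled action was 2,
            # its one-hot mask is [0, 0, 1, 0], so the reduce_sum below keeps
            # only the Q-value of the action that was actually taken.
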
            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Calculate loss between new Q-value and old Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))


        if frame_count % update_target_network == 0:
            print("Updating q~")
            # update the target network with new weights
            model_target.set_weights(model.get_weights())


        # Limit the state and reward history
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    # Update running reward to check condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    # save the weights if we've reached a new high
    if running_reward > max_reward:
        save_weights(env_name2, model, episode_count, current_time)
        max_reward = running_reward

    if running_reward > 75:  # Condition to consider the task solved
        save_weights(env_name2, model, episode_count, current_time)
        print("Solved at episode {}!".format(episode_count))
        break
@@ -0,0 +1,209 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run a pre-trained model\n",
    "\n",
    "This notebook loads a pre-trained model and uses it to play games. \n",
    "Note that it does not render the image of the game, it just prints out the episodic score. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pygame 2.1.2 (SDL 2.0.18, Python 3.10.8)\n",
      "Hello from the pygame community. https://www.pygame.org/contribute.html\n"
     ]
    }
   ],
   "source": [
    "# sanity check: can we create breakwall?\n",
    "import gym\n",
    "e = gym.make('gym_gs:BreakwallNoFrameskip-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting git+https://github.com/openai/baselines.git\n",
      "  Cloning https://github.com/openai/baselines.git to c:\\users\\gofor\\appdata\\local\\temp\\pip-req-build-s405pyio\n",
      "  Resolved https://github.com/openai/baselines.git to commit ea25b9e8b234e6ee1bca43083f8f3cf974143998\n",
      "  Preparing metadata (setup.py): started\n",
      "  Preparing metadata (setup.py): finished with status 'done'\n",
      "Requirement already satisfied: gym<0.16.0,>=0.15.4 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (0.15.7)\n",
      "Requirement already satisfied: scipy in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.9.3)\n",
      "Requirement already satisfied: tqdm in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (4.64.1)\n",
      "Requirement already satisfied: joblib in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.2.0)\n",
      "Requirement already satisfied: cloudpickle in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.2.2)\n",
      "Requirement already satisfied: click in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (8.1.3)\n",
      "Requirement already satisfied: opencv-python in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (4.6.0.66)\n",
      "Requirement already satisfied: numpy>=1.10.4 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.23.5)\n",
      "Requirement already satisfied: pyglet<=1.5.0,>=1.4.0 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.5.0)\n",
      "Requirement already satisfied: six in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.16.0)\n",
      "Requirement already satisfied: colorama in c:\\users\\gofor\\myvenv\\lib\\site-packages (from click->baselines==0.1.6) (0.4.6)\n",
      "Requirement already satisfied: future in c:\\users\\gofor\\myvenv\\lib\\site-packages (from pyglet<=1.5.0,>=1.4.0->gym<0.16.0,>=0.15.4->baselines==0.1.6) (0.18.2)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  Running command git clone --filter=blob:none --quiet https://github.com/openai/baselines.git 'C:\\Users\\gofor\\AppData\\Local\\Temp\\pip-req-build-s405pyio'\n",
      "\n",
      "[notice] A new release of pip available: 22.2.2 -> 22.3.1\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "# install baselines and other stuff\n",
    "!pip install git+https://github.com/openai/baselines.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded gym\n",
      "Model weights look loadable ./pre-trained/mac_hard_breakwall/gym_gs_BreakwallNoFrameskip-v1_20211018-114642_5424.data-00000-of-00001\n",
      "Model loaded weights - starting sim\n",
      "Game over at frame 278 rew 2.0 rewards/frame: 0.007194244604316547\n",
      "Game over at frame 453 rew 3.0 rewards/frame: 0.006622516556291391\n",
      "Game over at frame 631 rew 4.0 rewards/frame: 0.006339144215530904\n",
      "Game over at frame 906 rew 6.0 rewards/frame: 0.006622516556291391\n",
      "Game over at frame 976 rew 6.0 rewards/frame: 0.006147540983606557\n",
      "Sim ended : rew is 6.0\n"
     ]
    }
   ],
   "source": [
    "## full check - can we use the full opencv/ openai version\n",
    "## of the gym?\n",
    "\n",
    "# Script to test a pre-trained model\n",
    "# Written by Matthew Yee-King\n",
    "# MIT license\n",
    "# https://mit-license.org/\n",
    "\n",
    "import sys\n",
    "import os\n",
    "from baselines.common.atari_wrappers import make_atari, wrap_deepmind\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers\n",
    "import datetime\n",
    "import random\n",
    "import time\n",
    "\n",
    "env_name = \"gym_gs:BreakwallNoFrameskip-v1\"\n",
    "# for notebook users - make sure you have uploaded your pre-trained\n",
    "# models... then adjust this to reflect the file path\n",
    "model_file = \"./pre-trained/mac_hard_breakwall/gym_gs_BreakwallNoFrameskip-v1_20211018-114642_5424\"\n",
    "\n",
    "def create_q_model(num_actions):\n",
    "    # Network defined by the Deepmind paper\n",
    "    inputs = layers.Input(shape=(84, 84, 4,))\n",
    "    # Convolutions on the frames on the screen\n",
    "    layer1 = layers.Conv2D(32, 8, strides=4, activation=\"relu\")(inputs)\n",
    "    layer2 = layers.Conv2D(64, 4, strides=2, activation=\"relu\")(layer1)\n",
    "    layer3 = layers.Conv2D(64, 3, strides=1, activation=\"relu\")(layer2)\n",
    "    layer4 = layers.Flatten()(layer3)\n",
    "    layer5 = layers.Dense(512, activation=\"relu\")(layer4)\n",
    "    action = layers.Dense(num_actions, activation=\"linear\")(layer5)\n",
    "    return keras.Model(inputs=inputs, outputs=action)\n",
    "\n",
    "def create_env(env_name, seed=42):\n",
    "    try:\n",
    "        # Use the Baseline Atari environment because of Deepmind helper functions\n",
    "        env = make_atari(env_name)\n",
    "        # Wrap the frames, grey-scale, stack four frames and scale to smaller ratio\n",
    "        env = wrap_deepmind(env, frame_stack=True, scale=True)\n",
    "        print(\"Loaded gym\")\n",
    "        env.seed(seed)\n",
    "        return env\n",
    "    except:\n",
    "        print(\"Failed to make gym env\", env_name)\n",
    "        return None\n",
    "\n",
    "def run_sim(env, model, frame_count):\n",
    "    state = np.array(env.reset())\n",
    "    total_reward = 0\n",
    "    for i in range(frame_count):\n",
    "        # in the notebook version we cannot really\n",
    "        # render in realtime, so you just have\n",
    "        # to check the score :(\n",
    "        env.render('human')\n",
    "        state_tensor = keras.backend.constant(state)\n",
    "        state_tensor = keras.backend.expand_dims(state_tensor, 0)\n",
    "        action_values = model(state_tensor, training=False)\n",
    "        # Take best action\n",
    "        action = keras.backend.argmax(action_values[0]).numpy()\n",
    "        state, reward, done, _ = env.step(action)\n",
    "        state = np.array(state)\n",
    "        total_reward += reward\n",
    "        if done:\n",
    "            print(\"Game over at frame\", i, \"rew\", total_reward, \"rewards/frame: \", total_reward/i)\n",
    "            env.reset()\n",
    "            #break\n",
    "        #time.sleep(0.1)\n",
    "    print(\"Sim ended : rew is \", total_reward)\n",
    "\n",
    "def main(env_name, model_file, frame_count=1000, seed=42):\n",
    "    env = create_env(env_name=env_name)\n",
    "    assert env is not None, \"Failed to make env \" + env_name\n",
    "    model = create_q_model(num_actions=env.action_space.n)\n",
    "    model_testfile = model_file + \".data-00000-of-00001\"\n",
    "    assert os.path.exists(model_testfile), \"Failed to load model: \" + model_testfile\n",
    "    print(\"Model weights look loadable\", model_testfile)\n",
    "    model.load_weights(model_file)\n",
    "    print(\"Model loaded weights - starting sim\")\n",
    "    run_sim(env, model, frame_count)\n",
    "\n",
    "main(env_name, model_file, frame_count=1000)\n",
    "\n",
    "# LEV"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
@@ -0,0 +1,77 @@
# Script to test a pre-trained model
# Written by Matthew Yee-King
# MIT license
# https://mit-license.org/

import sys
import os
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import datetime
import random
import time

env_name = "gym_gs:BreakwallNoFrameskip-v1"
model_file = "./pre-trained/mac_hard_breakwall/gym_gs_BreakwallNoFrameskip-v1_20211018-114642_5424"
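# Added note (not in the original script): the weight file name uses an
# underscore ("gym_gs_...") rather than the env id's colon, matching the file
# referenced in the companion notebook and the names written by the training
# script. Adjust model_file to point at whichever pre-trained weights you
# want to test.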

def create_q_model(num_actions):
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=(84, 84, 4,))
    # Convolutions on the frames on the screen
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)
    return keras.Model(inputs=inputs, outputs=action)

def create_env(env_name, seed=42):
    try:
        # Use the Baseline Atari environment because of Deepmind helper functions
        env = make_atari(env_name)
        # Wrap the frames, grey-scale, stack four frames and scale to smaller ratio
        env = wrap_deepmind(env, frame_stack=True, scale=True)
        print("Loaded gym")
        env.seed(seed)
        return env
    except:
        print("Failed to make gym env", env_name)
        return None

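# Added note (an assumption about the baselines helpers, not in the original
# script): make_atari expects a "NoFrameskip"-style env id and applies its own
# frame-skipping and no-op reset wrappers, which is why the custom Breakwall
# environment id follows the same naming convention.
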
def run_sim(env, model, frame_count):
    state = np.array(env.reset())
    total_reward = 0
    for i in range(frame_count):
        env.render('human')
        state_tensor = keras.backend.constant(state)
        state_tensor = keras.backend.expand_dims(state_tensor, 0)
        action_values = model(state_tensor, training=False)
        # Take best action
        action = keras.backend.argmax(action_values[0]).numpy()
        state, reward, done, _ = env.step(action)
        state = np.array(state)
        total_reward += reward
        if done:
            print("Game over at frame", i, "rew", total_reward)
            env.reset()
            #break
        #time.sleep(0.1)
    print("Sim ended : rew is ", total_reward)

def main(env_name, model_file, frame_count=1000, seed=42):
    env = create_env(env_name=env_name)
    assert env is not None, "Failed to make env " + env_name
    model = create_q_model(num_actions=env.action_space.n)
    model_testfile = model_file + ".data-00000-of-00001"
    assert os.path.exists(model_testfile), "Failed to load model: " + model_testfile
    print("Model weights look loadable", model_testfile)
    model.load_weights(model_file)
    print("Model loaded weights - starting sim")
    run_sim(env, model, frame_count)

main(env_name, model_file, frame_count=1000)