diff --git a/CM3020 Artificial Intelligence/Week 4/4.13 DQN Flowchart A.pdf b/CM3020 Artificial Intelligence/Week 4/4.13 DQN Flowchart A.pdf
new file mode 100644
index 0000000..f8e3337
Binary files /dev/null and b/CM3020 Artificial Intelligence/Week 4/4.13 DQN Flowchart A.pdf differ
diff --git a/CM3020 Artificial Intelligence/Week 4/Lab Files/keras_io_dqn_save_weights_v1.py b/CM3020 Artificial Intelligence/Week 4/Lab Files/keras_io_dqn_save_weights_v1.py
new file mode 100644
index 0000000..d52b7aa
--- /dev/null
+++ b/CM3020 Artificial Intelligence/Week 4/Lab Files/keras_io_dqn_save_weights_v1.py
@@ -0,0 +1,270 @@
+## This entire file has been adapted from code
+## by Jacob Chapman and Mathias Lechner, available here as of
+## 11/11/2021
+## https://github.com/keras-team/keras-io/blob/master/examples/rl/deep_q_network_breakout.py
+## Changes made:
+## * use breakwall version of breakout instead of atari
+## * log in tensorboard compatible format and print logs
+## * save weights of model each time the moving episodic reward reaches a new max
+
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+import datetime
+
+# logging code
+# for tensorboard
+# https://www.tensorflow.org/tensorboard/get_started
+current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+# train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
+# train_summary_writer = tf.summary.create_file_writer(train_log_dir)
+# end of logging code
+# for tensorboard
+
+# Configuration parameters for the whole setup
+seed = 42
+gamma = 0.99  # Discount factor for past rewards
+epsilon = 1.0  # Epsilon greedy parameter
+epsilon_min = 0.1  # Minimum epsilon greedy parameter
+epsilon_max = 1.0  # Maximum epsilon greedy parameter
+epsilon_interval = (
+    epsilon_max - epsilon_min
+)  # Rate at which to reduce chance of random action being taken
+batch_size = 32  # Size of batch taken from replay buffer
+max_steps_per_episode = 10000
+
+#env_name = "BreakoutNoFrameskip-v4" # for windows?
+env_name = "gym_gs:BreakwallNoFrameskip-v1"
+#env_name2 = "BreakoutNoFrameskip-v4" # for windows?
+env_name2 = "gym_gs_BreakwallNoFrameskip-v1"
+
+# Use the Baseline Atari environment because of Deepmind helper functions
+env = make_atari(env_name)
+
+# Warp the frames, grey scale, stack four frames and scale pixel values to a smaller range
+env = wrap_deepmind(env, frame_stack=True, scale=True)
+env.seed(seed)
+
+"""
+## Implement the Deep Q-Network
+
+This network learns an approximation of the Q-table, which maps each state-action
+pair to an expected reward. For every state we'll have four actions that can be
+taken. The environment provides the state, and the action is chosen by selecting
+the largest of the four Q-values predicted in the output layer.
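+
+For example, a minimal sketch of that greedy step (using the `model` built below
+and a preprocessed `state` from the wrapped environment):
+
+    state_tensor = tf.expand_dims(tf.convert_to_tensor(state), 0)
+    q_values = model(state_tensor, training=False)
+    action = tf.argmax(q_values[0]).numpy()  # index of the largest Q-value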
+""" + +num_actions = 4 + + +def log(running_reward, last_reward, episode, mem_perc, epsilon, frame, tensorboard_log = False): + """ + log the running episodic reward, most recent reward, + episode count, epsilon value and frame count plus mem_perc which + is the percentage of the action memory that is full + """ + if tensorboard_log: + with train_summary_writer.as_default(): + tf.summary.scalar('running reward', running_reward, step=episode) + + template = 'Epoch,{}, Mem,{}%, Eps,{}, Frame,{}, Last reward:,{}, Running reward:,{}, ' + print (template.format(episode+1, + np.round(mem_perc, 3), + np.round(epsilon, 3), + frame, + last_reward, + running_reward)) + +def save_weights(env_name2, model, episode, run_id): + """ + save the weights of the sent model + with the env_name, episode and run_id used + to gneerate the filename + """ + print("Saving weights") + path = "./saves/" + model.save_weights(path + env_name2 + "_" + str(run_id)+"_"+str(episode)) + + +def create_q_model(): + # Network defined by the Deepmind paper + inputs = layers.Input(shape=(84, 84, 4,)) + # Convolutions on the frames on the screen + layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs) + layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1) + layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2) + layer4 = layers.Flatten()(layer3) + layer5 = layers.Dense(512, activation="relu")(layer4) + action = layers.Dense(num_actions, activation="linear")(layer5) + return keras.Model(inputs=inputs, outputs=action) + +# The first model makes the predictions for Q-values which are used to +# make a action. +model = create_q_model() +print(model.summary()) +# Build a target model for the prediction of future rewards. +# The weights of a target model get updated every 10000 steps thus when the +# loss between the Q-values is calculated the target Q-value is stable. +model_target = create_q_model() + +""" +## Train +""" +# In the Deepmind paper they use RMSProp however then Adam optimizer +# improves training time +optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0) + +# Experience replay buffers +action_history = [] +state_history = [] +state_next_history = [] +rewards_history = [] +done_history = [] +episode_reward_history = [] +running_reward = 0 +episode_count = 0 +frame_count = 0 +# Number of frames to take random action and observe output +epsilon_random_frames = 50000 +# Number of frames for exploration +epsilon_greedy_frames = 1000000.0 +# Maximum replay length +# Note: The Deepmind paper suggests 1000000 however this causes memory issues +max_memory_length = 100000 +# Train the model after 4 actions +update_after_actions = 4 +# How often to update the target network +update_target_network = 10000 +# Using huber loss for stability +loss_function = keras.losses.Huber() +epoch = 0 +# use this to decide when it is time to save the weights +max_reward = 0 + +while True: # Run until solved + state = np.array(env.reset()) + episode_reward = 0 + print("Starting episode", episode_count, "played frames", frame_count) + for timestep in range(1, max_steps_per_episode): #10000 + #env.render()# ; Adding this line would show the attempts + # of the agent in a pop up window. 
+ if frame_count % 250 == 0: + log(np.mean(episode_reward_history), + episode_reward, + episode_count, + len(rewards_history) / max_memory_length * 100, + epsilon, + frame_count) + + frame_count += 1 + # Use epsilon-greedy for exploration + if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]: + # Take random action + action = np.random.choice(num_actions) + else: + # Predict action Q-values + # From environment state + state_tensor = tf.convert_to_tensor(state) + state_tensor = tf.expand_dims(state_tensor, 0) + action_probs = model(state_tensor, training=False) + # Take best action + action = tf.argmax(action_probs[0]).numpy() + + # Decay probability of taking random action + epsilon -= epsilon_interval / epsilon_greedy_frames + epsilon = max(epsilon, epsilon_min) + + # Apply the sampled action in our environment + state_next, reward, done, _ = env.step(action) + state_next = np.array(state_next) + + #print("state shape:", state.shape) + episode_reward += reward + + # Save actions and states in replay buffer + action_history.append(action) + state_history.append(state) + state_next_history.append(state_next) + done_history.append(done) + rewards_history.append(reward) + state = state_next + + # Update every fourth frame and once batch size is over 32 + if frame_count % update_after_actions == 0 and len(done_history) > batch_size: + #print("Training neural network") + # Get indices of samples for replay buffers + indices = np.random.choice(range(len(done_history)), size=batch_size) + + # Using list comprehension to sample from replay buffer + state_sample = np.array([state_history[i] for i in indices]) + state_next_sample = np.array([state_next_history[i] for i in indices]) + rewards_sample = [rewards_history[i] for i in indices] + action_sample = [action_history[i] for i in indices] + done_sample = tf.convert_to_tensor( + [float(done_history[i]) for i in indices] + ) + + # Build the updated Q-values for the sampled future states + # Use the target model for stability + future_rewards = model_target.predict(state_next_sample) + # Q value = reward + discount factor * expected future reward + updated_q_values = rewards_sample + gamma * tf.reduce_max( + future_rewards, axis=1 + ) + + # If final frame set the last value to -1 + updated_q_values = updated_q_values * (1 - done_sample) - done_sample + + # Create a mask so we only calculate loss on the updated Q-values + masks = tf.one_hot(action_sample, num_actions) + + with tf.GradientTape() as tape: + # Train the model on the states and updated Q-values + q_values = model(state_sample) + + # Apply the masks to the Q-values to get the Q-value for action taken + q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1) + # Calculate loss between new Q-value and old Q-value + loss = loss_function(updated_q_values, q_action) + + # Backpropagation + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(grads, model.trainable_variables)) + + + if frame_count % update_target_network == 0: + print("Updating q~") + # update the the target network with new weights + model_target.set_weights(model.get_weights()) + + + # Limit the state and reward history + if len(rewards_history) > max_memory_length: + del rewards_history[:1] + del state_history[:1] + del state_next_history[:1] + del action_history[:1] + del done_history[:1] + + if done: + break + + # Update running reward to check condition for solving + episode_reward_history.append(episode_reward) + if len(episode_reward_history) > 100: + del 
episode_reward_history[:1] + running_reward = np.mean(episode_reward_history) + + episode_count += 1 + + # save the weights if we've reached a new high + if running_reward > max_reward: + save_weights(env_name2, model, episode_count, current_time) + max_reward = running_reward + + if running_reward > 75: # Condition to consider the task solved + save_weights(env_name2, model, episode_count, current_time) + print("Solved at episode {}!".format(episode_count)) + break diff --git a/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained.ipynb b/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained.ipynb new file mode 100644 index 0000000..33f4d6f --- /dev/null +++ b/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run a pre-trained model\n", + "\n", + "This notebook loads a pre-trained model and uses it to play games. \n", + "Note that it does not render the image of the game, it just prints out the episodic score. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pygame 2.1.2 (SDL 2.0.18, Python 3.10.8)\n", + "Hello from the pygame community. https://www.pygame.org/contribute.html\n" + ] + } + ], + "source": [ + "# sanity check: can we create breakwall?\n", + "import gym\n", + "e = gym.make('gym_gs:BreakwallNoFrameskip-v1')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/openai/baselines.git\n", + " Cloning https://github.com/openai/baselines.git to c:\\users\\gofor\\appdata\\local\\temp\\pip-req-build-s405pyio\n", + " Resolved https://github.com/openai/baselines.git to commit ea25b9e8b234e6ee1bca43083f8f3cf974143998\n", + " Preparing metadata (setup.py): started\n", + " Preparing metadata (setup.py): finished with status 'done'\n", + "Requirement already satisfied: gym<0.16.0,>=0.15.4 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (0.15.7)\n", + "Requirement already satisfied: scipy in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.9.3)\n", + "Requirement already satisfied: tqdm in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (4.64.1)\n", + "Requirement already satisfied: joblib in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.2.0)\n", + "Requirement already satisfied: cloudpickle in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (1.2.2)\n", + "Requirement already satisfied: click in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (8.1.3)\n", + "Requirement already satisfied: opencv-python in c:\\users\\gofor\\myvenv\\lib\\site-packages (from baselines==0.1.6) (4.6.0.66)\n", + "Requirement already satisfied: numpy>=1.10.4 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.23.5)\n", + "Requirement already satisfied: pyglet<=1.5.0,>=1.4.0 in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.5.0)\n", + "Requirement already satisfied: six in c:\\users\\gofor\\myvenv\\lib\\site-packages (from gym<0.16.0,>=0.15.4->baselines==0.1.6) (1.16.0)\n", + "Requirement already satisfied: colorama in c:\\users\\gofor\\myvenv\\lib\\site-packages (from click->baselines==0.1.6) (0.4.6)\n", + 
"Requirement already satisfied: future in c:\\users\\gofor\\myvenv\\lib\\site-packages (from pyglet<=1.5.0,>=1.4.0->gym<0.16.0,>=0.15.4->baselines==0.1.6) (0.18.2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone --filter=blob:none --quiet https://github.com/openai/baselines.git 'C:\\Users\\gofor\\AppData\\Local\\Temp\\pip-req-build-s405pyio'\n", + "\n", + "[notice] A new release of pip available: 22.2.2 -> 22.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "# install baselines and other stuff\n", + "!pip install git+https://github.com/openai/baselines.git" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded gym\n", + "Model weights look loadable ./pre-trained/mac_hard_breakwall/gym_gs_BreakwallNoFrameskip-v1_20211018-114642_5424.data-00000-of-00001\n", + "Model loaded weights - starting sim\n", + "Game over at frame 278 rew 2.0 rewards/frame: 0.007194244604316547\n", + "Game over at frame 453 rew 3.0 rewards/frame: 0.006622516556291391\n", + "Game over at frame 631 rew 4.0 rewards/frame: 0.006339144215530904\n", + "Game over at frame 906 rew 6.0 rewards/frame: 0.006622516556291391\n", + "Game over at frame 976 rew 6.0 rewards/frame: 0.006147540983606557\n", + "Sim ended : rew is 6.0\n" + ] + } + ], + "source": [ + "## full check - can we use the full opencv/ openai version \n", + "## of the gym?\n", + "\n", + "# Script to test a pre-trained model\n", + "# Written by Matthew Yee-King\n", + "# MIT license \n", + "# https://mit-license.org/\n", + "\n", + "import sys\n", + "import os\n", + "from baselines.common.atari_wrappers import make_atari, wrap_deepmind\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers\n", + "import datetime\n", + "import random\n", + "import time \n", + "\n", + "env_name = \"gym_gs:BreakwallNoFrameskip-v1\" \n", + "# for notebook users - make sure you have uploaded your pre-trained\n", + "# models... 
then adjust this to reflect the file path\n", + "model_file = \"./pre-trained/mac_hard_breakwall/gym_gs_BreakwallNoFrameskip-v1_20211018-114642_5424\"\n", + "\n", + "def create_q_model(num_actions):\n", + " # Network defined by the Deepmind paper\n", + " inputs = layers.Input(shape=(84, 84, 4,))\n", + " # Convolutions on the frames on the screen\n", + " layer1 = layers.Conv2D(32, 8, strides=4, activation=\"relu\")(inputs) \n", + " layer2 = layers.Conv2D(64, 4, strides=2, activation=\"relu\")(layer1)\n", + " layer3 = layers.Conv2D(64, 3, strides=1, activation=\"relu\")(layer2)\n", + " layer4 = layers.Flatten()(layer3)\n", + " layer5 = layers.Dense(512, activation=\"relu\")(layer4) \n", + " action = layers.Dense(num_actions, activation=\"linear\")(layer5) \n", + " return keras.Model(inputs=inputs, outputs=action)\n", + "\n", + "def create_env(env_name, seed=42):\n", + " try:\n", + " # Use the Baseline Atari environment because of Deepmind helper functions\n", + " env = make_atari(env_name)\n", + " # Warp the frames, grey scale, stake four frame and scale to smaller ratio\n", + " env = wrap_deepmind(env, frame_stack=True, scale=True)\n", + " print(\"Loaded gym\")\n", + " env.seed(seed)\n", + " return env\n", + " except:\n", + " print(\"Failed to make gym env\", env_name)\n", + " return None\n", + "\n", + "def run_sim(env, model, frame_count):\n", + " state = np.array(env.reset())\n", + " total_reward = 0\n", + " for i in range(frame_count):\n", + " # in the notebook version we cannot really \n", + " # render in realtime, so you just have\n", + " # to check the score :( \n", + " env.render('human')\n", + " state_tensor = keras.backend.constant(state)\n", + " state_tensor = keras.backend.expand_dims(state_tensor, 0)\n", + " action_values = model(state_tensor, training=False)\n", + " # Take best action\n", + " action = keras.backend.argmax(action_values[0]).numpy()\n", + " state, reward, done, _ = env.step(action)\n", + " state = np.array(state)\n", + " total_reward += reward\n", + " if done:\n", + " print(\"Game over at frame\", i, \"rew\", total_reward, \"rewards/frame: \", total_reward/i)\n", + " env.reset()\n", + " #break\n", + " #time.sleep(0.1)\n", + " print(\"Sim ended : rew is \", total_reward)\n", + "\n", + "def main(env_name, model_file,frame_count=1000, seed=42):\n", + " env = create_env(env_name=env_name)\n", + " assert env is not None, \"Failed to make env \" + env_name\n", + " model = create_q_model(num_actions=env.action_space.n)\n", + " model_testfile = model_file + \".data-00000-of-00001\"\n", + " assert os.path.exists(model_testfile), \"Failed to load model: \" + model_testfile\n", + " print(\"Model weights look loadable\", model_testfile)\n", + " model.load_weights(model_file)\n", + " print(\"Model loaded weights - starting sim\")\n", + " run_sim(env, model, frame_count)\n", + " \n", + "main(env_name, model_file, frame_count=1000)\n", + "\n", + "# LEV" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained_v1.py b/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained_v1.py new file mode 100644 index 
0000000..5f27eca --- /dev/null +++ b/CM3020 Artificial Intelligence/Week 4/Lab Files/run_pre_trained_v1.py @@ -0,0 +1,77 @@ +# Script to test a pre-trained model +# Written by Matthew Yee-King +# MIT license +# https://mit-license.org/ + +import sys +import os +from baselines.common.atari_wrappers import make_atari, wrap_deepmind +import numpy as np +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import datetime +import random +import time + +env_name = "gym_gs:BreakwallNoFrameskip-v1" +model_file = "./pre-trained/mac_hard_breakwall/gym_gs:BreakwallNoFrameskip-v1_20211018-114642_5424" + +def create_q_model(num_actions): + # Network defined by the Deepmind paper + inputs = layers.Input(shape=(84, 84, 4,)) + # Convolutions on the frames on the screen + layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs) + layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1) + layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2) + layer4 = layers.Flatten()(layer3) + layer5 = layers.Dense(512, activation="relu")(layer4) + action = layers.Dense(num_actions, activation="linear")(layer5) + return keras.Model(inputs=inputs, outputs=action) + +def create_env(env_name, seed=42): + try: + # Use the Baseline Atari environment because of Deepmind helper functions + env = make_atari(env_name) + # Warp the frames, grey scale, stake four frame and scale to smaller ratio + env = wrap_deepmind(env, frame_stack=True, scale=True) + print("Loaded gym") + env.seed(seed) + return env + except: + print("Failed to make gym env", env_name) + return None + +def run_sim(env, model, frame_count): + state = np.array(env.reset()) + total_reward = 0 + for i in range(frame_count): + env.render('human') + state_tensor = keras.backend.constant(state) + state_tensor = keras.backend.expand_dims(state_tensor, 0) + action_values = model(state_tensor, training=False) + # Take best action + action = keras.backend.argmax(action_values[0]).numpy() + state, reward, done, _ = env.step(action) + state = np.array(state) + total_reward += reward + if done: + print("Game over at frame", i, "rew", total_reward) + env.reset() + #break + #time.sleep(0.1) + print("Sim ended : rew is ", total_reward) + +def main(env_name, model_file,frame_count=1000, seed=42): + env = create_env(env_name=env_name) + assert env is not None, "Failed to make env " + env_name + model = create_q_model(num_actions=env.action_space.n) + model_testfile = model_file + ".data-00000-of-00001" + assert os.path.exists(model_testfile), "Failed to load model: " + model_testfile + print("Model weights look loadable", model_testfile) + model.load_weights(model_file) + print("Model loaded weights - starting sim") + run_sim(env, model, frame_count) + +main(env_name, model_file, frame_count=1000) +
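+
+# A note on where model_file comes from (a sketch; the exact path below is
+# hypothetical): keras_io_dqn_save_weights_v1.py writes its checkpoints to
+# "./saves/" + env_name2 + "_" + run_id + "_" + episode, where run_id is the
+# training run's start timestamp. TensorFlow stores each checkpoint as
+# <prefix>.index plus <prefix>.data-00000-of-00001, and load_weights() takes
+# the prefix, not the .data file. So to test your own training run you would
+# point model_file at something like:
+#
+#   model_file = "./saves/gym_gs_BreakwallNoFrameskip-v1_20211111-120000_1234"
+#   main(env_name, model_file, frame_count=1000)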