{ "cells": [ { "cell_type": "markdown", "id": "9ab9a8f7-1d63-4bd5-9ccf-cf9b7ba51bd5", "metadata": {}, "source": [ "\n", "
\n", "
\n", " \"Department\n", "
\n", "
\n", "

CS-866 Deep Reinforcement Learning

\n", " \n", "

Notebook: Policy-Based RL: REINFORCE (Mountain Car)

\n", "

Instructor: Nazar Khan   |   Semester: Fall 2025

\n", "
\n", "
\n", " \"University\n", "
\n", "
\n", "\n", "---\n", "\n", "**This notebook** is a self-contained teaching and evaluation resource for the REINFORCE policy-gradient algorithm implemented on `MontainCarContinuous-v0`. It includes: theory, fully-commented PyTorch code, training with TensorBoard logging, recording (MP4/GIF), visualizations, and an assignment + rubric for students." ] }, { "cell_type": "code", "execution_count": 1, "id": "0f6266bc-cd04-48fb-8622-03ffd8f08158", "metadata": {}, "outputs": [], "source": [ "# REINFORCE on MountainCarContinuous-v0\n", "# produces both GIF and MP4, uses Gymnasium API (reset -> obs, info).\n", "#\n", "# Requirements: gymnasium, torch, numpy, matplotlib, imageio, tqdm\n", "# Optional (for MP4): ffmpeg installed on system (imageio uses ffmpeg)\n", "\n", "import gymnasium as gym\n", "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from collections import deque\n", "import imageio\n", "import matplotlib.pyplot as plt\n", "from IPython.display import HTML, display\n", "from tqdm import trange\n", "import os\n", "import math\n", "import tempfile" ] }, { "cell_type": "code", "execution_count": 7, "id": "8379b5be-ab76-435c-baf6-2ccb8b2464a5", "metadata": {}, "outputs": [], "source": [ "# -----------------------------\n", "# 0) Short explanation\n", "# -----------------------------\n", "# We're teaching a little \"car\" how to drive up a hill to reach a flag.\n", "# The car can push left or right with a continuous force (a number between -1 and 1).\n", "# REINFORCE is a method where the car tries many times and learns from the\n", "# whole trip (episode) how good its choices were.\n", "#\n", "# We use a neural network to pick the push (action) as a random number\n", "# from a small bell-shaped curve (Gaussian). The network learns to shift\n", "# the bell-curve so better actions happen more often.\n", "#\n", "# We'll record videos (GIF + MP4) so you can watch the car learn." ] }, { "cell_type": "code", "execution_count": 8, "id": "30fda3f2-093f-4c40-9f7f-ae7fd60f113c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Environment: MountainCarContinuous-v0\n", "State dimension: 2, Action dimension: 1\n", "Action bounds: [-1.0, 1.0] (we will clip actions to this)\n" ] } ], "source": [ "# -----------------------------\n", "# 1) Environment setup\n", "# -----------------------------\n", "env_id = \"MountainCarContinuous-v0\"\n", "\n", "# Create the environment with rgb frames available for rendering\n", "env = gym.make(env_id, render_mode=\"rgb_array\") # gymnasium returns frames via env.render()\n", "\n", "# Get dimensions and bounds\n", "state_dim = env.observation_space.shape[0] # normally 2: position and velocity\n", "action_dim = env.action_space.shape[0] # continuous action dimension (should be 1 here)\n", "action_low = float(env.action_space.low[0])\n", "action_high = float(env.action_space.high[0])\n", "\n", "print(f\"Environment: {env_id}\")\n", "print(f\"State dimension: {state_dim}, Action dimension: {action_dim}\")\n", "print(f\"Action bounds: [{action_low}, {action_high}] (we will clip actions to this)\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "a8552925-639b-411a-ac29-a155964403e8", "metadata": {}, "outputs": [], "source": [ "# -----------------------------\n", "# 2) Policy network (Gaussian policy)\n", "# -----------------------------\n", "# The network will produce the mean of the gaussian. We keep a small learnable\n", "# log-standard-deviation so the network can tune how exploratory it is.\n", "\n", "class PolicyNet(nn.Module):\n", " def __init__(self, state_dim, action_dim, hidden=128):\n", " super().__init__()\n", " # Simple 2-layer MLP\n", " self.fc1 = nn.Linear(state_dim, hidden)\n", " self.fc2 = nn.Linear(hidden, hidden)\n", " self.mean = nn.Linear(hidden, action_dim)\n", " # We'll use a separate learnable parameter for log std (shared per action dim)\n", " # Initialized to small negative value so initial std is modest.\n", " self.log_std = nn.Parameter(torch.ones(action_dim) * -0.5)\n", "\n", " def forward(self, x):\n", " # x is a torch tensor of shape (state_dim,)\n", " x = torch.tanh(self.fc1(x))\n", " x = torch.tanh(self.fc2(x))\n", " mean = self.mean(x) # shape: (action_dim,)\n", " std = torch.exp(self.log_std) # shape: (action_dim,)\n", " return mean, std\n", "\n", "# Instantiate policy and optimizer\n", "policy = PolicyNet(state_dim, action_dim, hidden=128)\n", "optimizer = optim.Adam(policy.parameters(), lr=1e-3)" ] }, { "cell_type": "code", "execution_count": 10, "id": "21bbb676-54f5-49b5-946e-429eee2d7a41", "metadata": {}, "outputs": [], "source": [ "# -----------------------------\n", "# 3) Helper functions: selecting actions & computing returns\n", "# -----------------------------\n", "def select_action(state_np):\n", " \"\"\"\n", " state_np: numpy array of state (shape (state_dim,))\n", " Returns: clipped_action (numpy), log_prob (torch scalar)\n", " \"\"\"\n", " # Convert numpy -> torch\n", " state = torch.tensor(state_np, dtype=torch.float32)\n", " mean, std = policy(state) # both torch tensors\n", " # Create a normal distribution per action dimension\n", " dist = torch.distributions.Normal(mean, std)\n", " action = dist.sample() # sample an action (torch)\n", " action_clipped = torch.clamp(action, action_low, action_high)\n", " # Sum log_prob across action dims to get scalar log probability\n", " log_prob = dist.log_prob(action).sum()\n", " return action_clipped.detach().numpy(), log_prob\n", "\n", "def compute_returns(rewards, gamma=0.99):\n", " \"\"\"\n", " Given a list of rewards for an episode, compute discounted returns G_t.\n", " Then normalize returns (important for stable REINFORCE learning).\n", " Returns: torch tensor of shape (len(rewards),)\n", " \"\"\"\n", " returns = []\n", " G = 0.0\n", " for r in reversed(rewards):\n", " G = r + gamma * G\n", " returns.insert(0, G)\n", " returns = torch.tensor(returns, dtype=torch.float32)\n", " # Normalize (subtract mean, divide by std) to reduce variance\n", " if returns.std().item() > 1e-8:\n", " returns = (returns - returns.mean()) / (returns.std() + 1e-8)\n", " else:\n", " returns = returns - returns.mean()\n", " return returns" ] }, { "cell_type": "code", "execution_count": 7, "id": "dcae85d4-9047-483f-aec3-7398762b1056", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Training episodes: 100%|ā–ˆ| 2000/2000 [23:29<00:00, 1.42it/s, ep_reward=-1.01, r\n" ] } ], "source": [ "# -----------------------------\n", "# 4) Training loop with early stopping & recording\n", "# -----------------------------\n", "# Hyperparameters (you can tweak them)\n", "num_episodes = 2000 # upper bound on episodes\n", "gamma = 0.99 # discount factor\n", "render_every = 200 # record a single frame every N episodes to create animation steps\n", "record_last_episodes = 5 # record a few full episodes after training to make better video\n", "early_stop_window = 100 # window for computing recent average reward\n", "early_stop_threshold = 90.0 # average reward threshold to consider solved (common for this env)\n", "# Explanation: MountainCarContinuous gives reward ~100 for success, so ~90 is good.\n", "\n", "# Logging structures\n", "episode_rewards = []\n", "recent_buffer = deque(maxlen=early_stop_window)\n", "frames_for_gif = [] # will store a handful of frames captured periodically\n", "frames_final_episodes = [] # will store full frames for final example episodes (for video)\n", "scale = 0.25 # scale for reducing frame size to save RAM\n", "\n", "# We'll use tqdm for nicer progress bar\n", "pbar = trange(num_episodes, desc=\"Training episodes\")\n", "\n", "# For stability, set a manual seed (optional)\n", "torch.manual_seed(42)\n", "np.random.seed(42)\n", "\n", "solved_episode = None\n", "\n", "for episode in pbar:\n", " # Gymnasium reset returns (obs, info)\n", " state, _ = env.reset()\n", " done = False\n", " rewards = []\n", " log_probs = []\n", "\n", " # Run one full episode (until terminated or truncated)\n", " while not done:\n", " # Optionally capture a frame every so often to make a slow-motion GIF of learning\n", " if (episode + 1) % render_every == 0:\n", " try:\n", " frame = env.render() # rgb array\n", " frame = cv2.resize(frame, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_AREA)\n", " frames_for_gif.append(frame)\n", " except Exception:\n", " pass # rendering might fail in headless environments; it's optional\n", "\n", " action, log_prob = select_action(state) # get action & its log probability\n", " next_state, reward, terminated, truncated, _ = env.step(action)\n", " done = bool(terminated or truncated)\n", "\n", " rewards.append(float(reward))\n", " log_probs.append(log_prob)\n", "\n", " state = next_state\n", "\n", " # Compute returns and policy loss\n", " returns = compute_returns(rewards, gamma=gamma) # torch tensor\n", " # Multiply -log_prob * return for each timestep (we minimize negative objective)\n", " policy_loss_terms = []\n", " for lp, G in zip(log_probs, returns):\n", " # -log_prob * G (we use negative because optimizers minimize)\n", " policy_loss_terms.append(-lp * G)\n", " policy_loss = torch.stack(policy_loss_terms).sum()\n", "\n", " # Gradient step\n", " optimizer.zero_grad()\n", " policy_loss.backward()\n", " optimizer.step()\n", "\n", " # Logging & early stopping check\n", " ep_reward = sum(rewards)\n", " episode_rewards.append(ep_reward)\n", " recent_buffer.append(ep_reward)\n", "\n", " # Update progress bar\n", " pbar.set_postfix({\"ep_reward\": f\"{ep_reward:.2f}\",\n", " \"recent_mean\": f\"{np.mean(recent_buffer):.2f}\" if len(recent_buffer)>0 else \"N/A\"})\n", "\n", " # Early stopping condition: average of last early_stop_window episodes above threshold\n", " if len(recent_buffer) == early_stop_window:\n", " recent_mean = float(np.mean(recent_buffer))\n", " if recent_mean >= early_stop_threshold:\n", " solved_episode = episode + 1\n", " print(f\"\\nšŸŽ‰ Early stopping at episode {solved_episode}: recent mean = {recent_mean:.2f}\")\n", " break\n", "\n", "# If training ended early, record a few full episodes to show final behavior\n", "# Record record_last_episodes full episodes (frames) for a nicer final video\n", "for i in range(record_last_episodes):\n", " s, _ = env.reset()\n", " done = False\n", " episode_frames = []\n", " while not done:\n", " # collect frames for video\n", " try:\n", " frame = env.render()\n", " frame = cv2.resize(frame, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_AREA)\n", " episode_frames.append(frame)\n", " except Exception:\n", " pass\n", " a, _ = select_action(s)\n", " s, r, terminated, truncated, _ = env.step(a)\n", " done = bool(terminated or truncated)\n", " frames_final_episodes.extend(episode_frames)\n", "\n", "# If no final frames captured (headless), we still might have periodic frames_for_gif\n", "if len(frames_final_episodes) == 0 and len(frames_for_gif) == 0:\n", " print(\"Warning: no frames were recorded (rendering may be disabled in this environment).\")\n", "\n", "# Close env\n", "env.close()" ] }, { "cell_type": "code", "execution_count": 11, "id": "4c965996-60d1-48bd-b801-e8df37b91232", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "IMAGEIO FFMPEG_WRITER WARNING: input image is not divisible by macro_block_size=16, resizing from (600, 400) to (608, 400) to ensure video compatibility with most codecs and players. To prevent resizing, make your input image divisible by the macro_block_size or set the macro_block_size to 1 (risking incompatibility).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Could not save progress GIF: Unable to allocate 6.70 GiB for an array with shape (9990, 400, 600, 3) and data type uint8\n", "Could not save final GIF: Unable to allocate 3.35 GiB for an array with shape (4995, 400, 600, 3) and data type uint8\n", "Could not save MP4 (ffmpeg may be missing). Error: [Errno 12] Cannot allocate memory\n", "Final GIF is available if created above.\n" ] } ], "source": [ "# -----------------------------\n", "# 5) Save animations: GIF + MP4 (if possible)\n", "# -----------------------------\n", "# We'll attempt to save:\n", "# - \"mountaincar_reinforce_progress.gif\": short GIF made from periodic frames_for_gif\n", "# - \"mountaincar_reinforce_final.mp4\": MP4 made from final full episode frames (if available)\n", "#\n", "# If ffmpeg is unavailable, MP4 saving may fail; GIF saving with imageio should still work.\n", "\n", "out_dir = \"reinforce_videos\"\n", "os.makedirs(out_dir, exist_ok=True)\n", "\n", "# Helper to scale frames (optional): ensure frames are uint8 arrays\n", "def _to_uint8(frame):\n", " # frame might already be uint8; if floats, scale to 0..255\n", " if frame.dtype == np.uint8:\n", " return frame\n", " f = np.clip(frame, 0.0, 1.0)\n", " f = (255.0 * f).astype(np.uint8)\n", " return f\n", "\n", "# 5a) Save short progress GIF if we have periodic frames\n", "gif_path = os.path.join(out_dir, \"mountaincar_reinforce_progress.gif\")\n", "if len(frames_for_gif) > 0:\n", " try:\n", " frames_uint8 = [_to_uint8(f) for f in frames_for_gif]\n", " # Make the GIF (speed up by setting fps)\n", " imageio.mimsave(gif_path, frames_uint8, fps=15)\n", " print(f\"Saved progress GIF to: {gif_path} (frames used: {len(frames_uint8)})\")\n", " except Exception as e:\n", " print(\"Could not save progress GIF:\", e)\n", "else:\n", " print(\"No periodic frames collected for progress GIF.\")\n", "\n", "# 5b) Save final episodes MP4/GIF using frames_final_episodes if available\n", "final_gif_path = os.path.join(out_dir, \"mountaincar_reinforce_final.gif\")\n", "final_mp4_path = os.path.join(out_dir, \"mountaincar_reinforce_final.mp4\")\n", "if len(frames_final_episodes) > 0:\n", " frames_uint8_final = [_to_uint8(f) for f in frames_final_episodes]\n", " # Save final GIF\n", " try:\n", " imageio.mimsave(final_gif_path, frames_uint8_final, fps=30)\n", " print(f\"Saved final GIF to: {final_gif_path} (frames: {len(frames_uint8_final)})\")\n", " except Exception as e:\n", " print(\"Could not save final GIF:\", e)\n", "\n", " # Try saving MP4 (may require ffmpeg)\n", " try:\n", " # imageio's ffmpeg writer uses plugin \"ffmpeg\"\n", " imageio.mimsave(final_mp4_path, frames_uint8_final, fps=30, codec=\"libx264\")\n", " print(f\"Saved final MP4 to: {final_mp4_path}\")\n", " except Exception as e:\n", " print(\"Could not save MP4 (ffmpeg may be missing). Error:\", e)\n", " print(\"Final GIF is available if created above.\")\n", "else:\n", " print(\"No full-episode frames collected for final video.\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "2b2c6835-b093-4826-8adf-6a7dd415ea24", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Attempting to display saved media (if any). If files are not shown, check the 'reinforce_videos' folder.\n", "No media displayed inline. Look in the 'reinforce_videos' folder for created files.\n" ] } ], "source": [ "# -----------------------------\n", "# 6) Plot training curves and show animations inline (if possible)\n", "# -----------------------------\n", "# Plot the reward per episode and a moving average.\n", "plt.figure(figsize=(12,5))\n", "plt.plot(episode_rewards, alpha=0.6, label=\"Episode reward\")\n", "# moving average\n", "window = 50\n", "if len(episode_rewards) >= window:\n", " mov_avg = np.convolve(episode_rewards, np.ones(window)/window, mode=\"valid\")\n", " plt.plot(range(window-1, window-1+len(mov_avg)), mov_avg, linewidth=3, label=f\"Moving avg ({window})\")\n", "plt.axhline(early_stop_threshold, color=\"green\", linestyle=\"--\", label=f\"Early stop threshold ({early_stop_threshold})\")\n", "plt.xlabel(\"Episode\")\n", "plt.ylabel(\"Total reward (sum per episode)\")\n", "plt.title(\"REINFORCE training on MountainCarContinuous-v0\")\n", "plt.legend()\n", "plt.grid()\n", "plt.show()\n", "\n", "# Try to display final GIF and MP4 inline (Jupyter)\n", "def try_display(path):\n", " if not os.path.exists(path):\n", " return False\n", " ext = os.path.splitext(path)[1].lower()\n", " if ext == \".gif\":\n", " display(HTML(f'\"gif\"'))\n", " return True\n", " if ext == \".mp4\":\n", " # HTML5 video tag for mp4\n", " display(HTML(f\"\"\"\n", " \n", " \"\"\"))\n", " return True\n", " return False\n", "\n", "print(\"\\nAttempting to display saved media (if any). If files are not shown, check the 'reinforce_videos' folder.\")\n", "shown = False\n", "# Prefer final mp4 then final gif then progress gif\n", "if try_display(final_mp4_path):\n", " shown = True\n", "elif try_display(final_gif_path):\n", " shown = True\n", "elif try_display(gif_path):\n", " shown = True\n", "\n", "if not shown:\n", " print(\"No media displayed inline. Look in the 'reinforce_videos' folder for created files.\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "f587b3b4-f944-4d8e-b37e-59d0319b4e28", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Done running training cell.\n", "Training finished without meeting early-stop threshold. You can:\n", "- Increase num_episodes\n", "- Try lowering the learning rate or increasing network size\n", "- Use a baseline (actor-critic) for faster learning\n", "\n", "Files are in the folder: /home/nazar/teaching/CS866/notebooks/Reinforce/reinforce_videos\n", "If MP4 saving failed, ensure ffmpeg is installed on your system (e.g., 'sudo apt install ffmpeg' on Ubuntu).\n" ] } ], "source": [ "# -----------------------------\n", "# 7) Short tips & next steps (printed for the user)\n", "# -----------------------------\n", "print(\"\\nDone running training cell.\")\n", "if solved_episode is not None:\n", " print(f\"Environment considered solved at episode {solved_episode} (avg >= {early_stop_threshold}).\")\n", "else:\n", " print(\"Training finished without meeting early-stop threshold. You can:\")\n", " print(\"- Increase num_episodes\")\n", " print(\"- Try lowering the learning rate or increasing network size\")\n", " print(\"- Use a baseline (actor-critic) for faster learning\")\n", "\n", "print(\"\\nFiles are in the folder:\", os.path.abspath(out_dir))\n", "print(\"If MP4 saving failed, ensure ffmpeg is installed on your system (e.g., 'sudo apt install ffmpeg' on Ubuntu).\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e366ee88-087a-49ab-b44f-659faa460b27", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" } }, "nbformat": 4, "nbformat_minor": 5 }