import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gradio as gr


class GridWorld:
    """A simple grid world environment with a goal cell and penalty cells ("obstacles")."""

    def __init__(self, height=4, width=4):
        self.height = height
        self.width = width
        self.n_states = self.height * self.width

        # Four actions, indexed 0-3.
        self.n_actions = 4
        self.action_names = ['Up', 'Right', 'Down', 'Left']

        # Reward layout: +1 at the goal (bottom-right corner), 0 elsewhere.
        self.rewards = np.zeros((self.height, self.width))
        self.rewards[self.height - 1, self.width - 1] = 1.0

        # For grids of at least 4x4, mark three penalty cells near the start.
        # They are not impassable: entering one simply yields a -1 reward.
        self.obstacles = []
        if height >= 4 and width >= 4:
            self.rewards[1, 1] = -1.0
            self.rewards[1, 2] = -1.0
            self.rewards[2, 1] = -1.0
            self.obstacles = [(1, 1), (1, 2), (2, 1)]

        self.start_state = (0, 0)
        self.goal_state = (self.height - 1, self.width - 1)

        self.reset()

    def reset(self):
        """Reset the agent to the start state."""
        self.agent_position = self.start_state
        return self._get_state()

    def _get_state(self):
        """Convert the agent's (row, col) position to a state number."""
        row, col = self.agent_position
        return row * self.width + col

    def _get_pos_from_state(self, state):
        """Convert a state number to a (row, col) position."""
        row = state // self.width
        col = state % self.width
        return (row, col)

    def step(self, action):
        """Take an action and return (next_state, reward, done)."""
        row, col = self.agent_position

        # Moves that would leave the grid keep the agent in place.
        if action == 0:    # Up
            row = max(0, row - 1)
        elif action == 1:  # Right
            col = min(self.width - 1, col + 1)
        elif action == 2:  # Down
            row = min(self.height - 1, row + 1)
        elif action == 3:  # Left
            col = max(0, col - 1)

        self.agent_position = (row, col)
        reward = self.rewards[row, col]
        done = (row, col) == self.goal_state

        return self._get_state(), reward, done
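

# Hedged sanity-check sketch (our addition, not used by the Gradio app below):
# on the default 4x4 grid, moving Right three times and then Down three times
# skirts the penalty cells and ends on the goal with reward +1.0. The helper
# name `_demo_env_walk` is ours.
def _demo_env_walk():
    env = GridWorld()
    env.reset()
    for action in [1, 1, 1, 2, 2, 2]:  # 1 = Right, 2 = Down
        state, reward, done = env.step(action)
        print(f"state={state}, reward={reward}, done={done}")  # final line: reward 1.0, done True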


class QLearningAgent:
    """A simple tabular Q-learning agent."""

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount_factor=0.9,
                 exploration_rate=1.0, exploration_decay=0.995):
        """Initialize the Q-learning agent."""
        self.n_states = n_states
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

        # Q-table: one row per state, one column per action.
        self.q_table = np.zeros((n_states, n_actions))

        # How often each state has been updated (used for the visitation heatmap).
        self.visit_counts = np.zeros(n_states)

        # Per-episode training metrics.
        self.rewards_history = []
        self.exploration_rates = []

    def select_action(self, state):
        """Select an action using an epsilon-greedy policy."""
        if np.random.random() < self.exploration_rate:
            # Explore: random action.
            return np.random.randint(self.n_actions)
        else:
            # Exploit: action with the highest Q-value for this state.
            return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, done):
        """Update the Q-table using the Q-learning update rule."""
        # Target is r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
        if done:
            q_target = reward
        else:
            q_target = reward + self.discount_factor * np.max(self.q_table[next_state])

        # Move Q(s, a) toward the target by a step of size learning_rate.
        self.q_table[state, action] += self.learning_rate * (q_target - self.q_table[state, action])

        self.visit_counts[state] += 1

    def decay_exploration(self):
        """Decay the exploration rate and record its new value."""
        self.exploration_rate *= self.exploration_decay
        self.exploration_rates.append(self.exploration_rate)

    def get_policy(self):
        """Return the current greedy policy (best action for each state)."""
        return np.argmax(self.q_table, axis=1)

    def reset(self):
        """Reset the agent for a new training session."""
        self.q_table = np.zeros((self.n_states, self.n_actions))
        self.visit_counts = np.zeros(self.n_states)
        self.rewards_history = []
        self.exploration_rates = []
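

# Hedged numeric check of the update rule (our addition, not part of the app):
# with learning_rate=0.5 and an all-zero Q-table, a terminal transition with
# reward 1.0 has target 1.0, so Q(s, a) moves from 0.0 to 0.5. The helper name
# `_demo_single_update` is ours.
def _demo_single_update():
    agent = QLearningAgent(n_states=2, n_actions=4, learning_rate=0.5)
    agent.update(state=0, action=1, reward=1.0, next_state=1, done=True)
    print(agent.q_table[0, 1])  # expected: 0.5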


def create_gridworld_figure(env, agent, episode_count=0, total_reward=0):
    """Create a figure with the environment, visit heatmap, and Q-values."""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(f"Episode: {episode_count}, Total Reward: {total_reward:.2f}, "
                 f"Exploration Rate: {agent.exploration_rate:.2f}")

    # Cell colors for the environment panel.
    colors = {
        'empty': 'white',
        'obstacle': 'black',
        'goal': 'green',
        'start': 'blue',
        'agent': 'red'
    }

    def draw_grid(ax):
        """Draw grid lines and set up axis limits and ticks."""
        for i in range(env.height + 1):
            ax.axhline(i, color='black', lw=1)
        for j in range(env.width + 1):
            ax.axvline(j, color='black', lw=1)

        ax.set_xlim(0, env.width)
        ax.set_ylim(0, env.height)
        ax.invert_yaxis()  # row 0 is drawn at the top
        ax.set_xticks(np.arange(0.5, env.width, 1))
        ax.set_yticks(np.arange(0.5, env.height, 1))
        ax.set_xticklabels(range(env.width))
        ax.set_yticklabels(range(env.height))

    def draw_cell(ax, row, col, cell_type):
        """Fill a single cell with the color for its type."""
        color = colors.get(cell_type, 'white')
        rect = patches.Rectangle((col, row), 1, 1, linewidth=1, edgecolor='black',
                                 facecolor=color, alpha=0.7)
        ax.add_patch(rect)

    def draw_arrow(ax, row, col, action):
        """Draw an arrow in a cell indicating the given action."""
        arrow_starts = {
            0: (col + 0.5, row + 0.7),  # Up
            1: (col + 0.3, row + 0.5),  # Right
            2: (col + 0.5, row + 0.3),  # Down
            3: (col + 0.7, row + 0.5)   # Left
        }
        arrow_ends = {
            0: (col + 0.5, row + 0.3),
            1: (col + 0.7, row + 0.5),
            2: (col + 0.5, row + 0.7),
            3: (col + 0.3, row + 0.5)
        }
        ax.annotate('', xy=arrow_ends[action], xytext=arrow_starts[action],
                    arrowprops=dict(arrowstyle='->', lw=2, color='blue'))

    # Panel 1: the environment with the agent and the greedy policy arrows.
    ax = axes[0]
    ax.set_title('GridWorld Environment')
    draw_grid(ax)

    for i in range(env.height):
        for j in range(env.width):
            if (i, j) in env.obstacles:
                draw_cell(ax, i, j, 'obstacle')
            elif (i, j) == env.goal_state:
                draw_cell(ax, i, j, 'goal')
            elif (i, j) == env.start_state:
                draw_cell(ax, i, j, 'start')

    # Draw the agent on top of whatever cell it occupies.
    row, col = env.agent_position
    draw_cell(ax, row, col, 'agent')

    # Overlay the current greedy policy (skip obstacles and the goal).
    policy = agent.get_policy()
    for state in range(env.n_states):
        row, col = env._get_pos_from_state(state)
        if (row, col) not in env.obstacles and (row, col) != env.goal_state:
            draw_arrow(ax, row, col, policy[state])

    ax.set_aspect('equal')

    # Panel 2: how often each state has been visited during training.
    ax = axes[1]
    ax.set_title('State Visitation Heatmap')
    draw_grid(ax)

    heatmap_data = np.zeros((env.height, env.width))
    for state in range(env.n_states):
        row, col = env._get_pos_from_state(state)
        heatmap_data[row, col] = agent.visit_counts[state]

    # Avoid division by zero when nothing has been visited yet.
    max_visits = max(1, np.max(heatmap_data))

    for i in range(env.height):
        for j in range(env.width):
            if (i, j) in env.obstacles:
                draw_cell(ax, i, j, 'obstacle')
            elif (i, j) == env.goal_state:
                draw_cell(ax, i, j, 'goal')
            else:
                intensity = heatmap_data[i, j] / max_visits
                color = plt.cm.viridis(intensity)
                rect = patches.Rectangle((j, i), 1, 1, linewidth=1, edgecolor='black',
                                         facecolor=color, alpha=0.7)
                ax.add_patch(rect)

                if heatmap_data[i, j] > 0:
                    # Dark text on bright (high-intensity) viridis cells, white on dark ones.
                    ax.text(j + 0.5, i + 0.5, f"{int(heatmap_data[i, j])}",
                            ha='center', va='center',
                            color='black' if intensity > 0.5 else 'white')

    ax.set_aspect('equal')

    # Panel 3: per-action Q-values drawn as arrows scaled by magnitude.
    ax = axes[2]
    ax.set_title('Q-Values')
    draw_grid(ax)

    for state in range(env.n_states):
        row, col = env._get_pos_from_state(state)

        if (row, col) in env.obstacles:
            draw_cell(ax, row, col, 'obstacle')
            continue

        if (row, col) == env.goal_state:
            draw_cell(ax, row, col, 'goal')
            continue

        q_values = agent.q_table[state]

        for action in range(env.n_actions):
            q_value = q_values[action]

            # Only positive Q-values are drawn, scaled relative to the best action.
            if q_value > 0:
                max_q = max(0.1, np.max(q_values))
                arrow_size = 0.3 * (q_value / max_q)

                center_x = col + 0.5
                center_y = row + 0.5

                # (dx, dy) offsets for Up, Right, Down, Left on an inverted y-axis.
                directions = [
                    (0, -arrow_size),
                    (arrow_size, 0),
                    (0, arrow_size),
                    (-arrow_size, 0)
                ]
                dx, dy = directions[action]

                ax.arrow(center_x, center_y, dx, dy, head_width=0.1, head_length=0.1,
                         fc='blue', ec='blue', alpha=0.7)

                # Text positions for the numeric Q-value, one per action direction.
                text_positions = [
                    (center_x, center_y - 0.25),
                    (center_x + 0.25, center_y),
                    (center_x, center_y + 0.25),
                    (center_x - 0.25, center_y)
                ]
                tx, ty = text_positions[action]
                ax.text(tx, ty, f"{q_value:.2f}", ha='center', va='center', fontsize=8,
                        bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.1'))

    ax.set_aspect('equal')

    plt.tight_layout()
    return fig


def create_metrics_figure(agent):
    """Create a figure with training metrics."""
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    if agent.rewards_history:
        axes[0].plot(agent.rewards_history)
        axes[0].set_title('Rewards per Episode')
        axes[0].set_xlabel('Episode')
        axes[0].set_ylabel('Total Reward')
        axes[0].grid(True)
    else:
        axes[0].set_title('No reward data yet')

    if agent.exploration_rates:
        axes[1].plot(agent.exploration_rates)
        axes[1].set_title('Exploration Rate Decay')
        axes[1].set_xlabel('Episode')
        axes[1].set_ylabel('Exploration Rate (ε)')
        axes[1].grid(True)
    else:
        axes[1].set_title('No exploration rate data yet')

    plt.tight_layout()
    return fig


def train_single_episode(env, agent):
    """Train for a single episode and return the total reward."""
    state = env.reset()
    total_reward = 0
    done = False
    steps = 0
    max_steps = env.width * env.height * 3  # safety cap against endless wandering

    while not done and steps < max_steps:
        # Choose an action (epsilon-greedy), act, and learn from the transition.
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        agent.update(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward
        steps += 1

    # Decay exploration once per episode and record the episode return.
    agent.decay_exploration()
    agent.rewards_history.append(total_reward)

    return total_reward
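

# Hedged headless usage sketch (our addition): train without the Gradio UI and
# print the greedy policy as a grid of action names. The helper name
# `_headless_training_demo` and the episode count are ours.
def _headless_training_demo(episodes=200):
    env = GridWorld(height=4, width=4)
    agent = QLearningAgent(n_states=env.n_states, n_actions=env.n_actions)
    for _ in range(episodes):
        train_single_episode(env, agent)
    policy = agent.get_policy().reshape(env.height, env.width)
    for row in policy:
        print(' '.join(env.action_names[a] for a in row))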


def train_agent(env, agent, episodes, progress=gr.Progress()):
    """Train the agent for a specified number of episodes."""
    progress_text = ""
    progress(0, desc="Starting training...")

    for episode in progress.tqdm(range(episodes)):
        total_reward = train_single_episode(env, agent)

        if (episode + 1) % 10 == 0 or episode == episodes - 1:
            progress_text += f"Episode {episode + 1}/{episodes}, Reward: {total_reward}, Exploration: {agent.exploration_rate:.3f}\n"

    env_fig = create_gridworld_figure(env, agent, episode_count=episodes, total_reward=total_reward)
    metrics_fig = create_metrics_figure(agent)

    return env_fig, metrics_fig, progress_text


def run_test_episode(env, agent):
    """Run a test episode using the learned greedy policy."""
    state = env.reset()
    total_reward = 0
    done = False
    path = [env._get_pos_from_state(state)]
    steps = 0
    max_steps = env.width * env.height * 3  # safety cap in case the policy never reaches the goal

    while not done and steps < max_steps:
        # Always exploit: take the action with the highest Q-value.
        action = np.argmax(agent.q_table[state])
        next_state, reward, done = env.step(action)

        state = next_state
        total_reward += reward
        path.append(env._get_pos_from_state(state))
        steps += 1

    env_fig = create_gridworld_figure(env, agent, episode_count="Test", total_reward=total_reward)

    path_text = "Path taken:\n"
    for i, pos in enumerate(path):
        path_text += f"Step {i}: {pos}\n"

    return env_fig, path_text, f"Test completed with total reward: {total_reward}"


def create_ui():
    """Create the Gradio interface."""
    env = GridWorld(height=4, width=4)
    agent = QLearningAgent(
        n_states=env.n_states,
        n_actions=env.n_actions,
        learning_rate=0.1,
        discount_factor=0.9,
        exploration_rate=1.0,
        exploration_decay=0.995
    )

    init_env_fig = create_gridworld_figure(env, agent)
    init_metrics_fig = create_metrics_figure(agent)

    with gr.Blocks(title="Q-Learning GridWorld Simulator") as demo:
        gr.Markdown("# Q-Learning GridWorld Simulator")

        with gr.Tab("Environment Setup"):
            with gr.Row():
                with gr.Column():
                    grid_height = gr.Slider(minimum=3, maximum=8, value=4, step=1, label="Grid Height")
                    grid_width = gr.Slider(minimum=3, maximum=8, value=4, step=1, label="Grid Width")
                    setup_btn = gr.Button("Setup Environment")

                env_display = gr.Plot(value=init_env_fig, label="Environment")

            with gr.Row():
                setup_info = gr.Textbox(label="Environment Info",
                                        value="4x4 GridWorld with start at (0,0) and goal at (3,3)")

        with gr.Tab("Train Agent"):
            with gr.Row():
                with gr.Column():
                    learning_rate = gr.Slider(minimum=0.01, maximum=1.0, value=0.1, step=0.01, label="Learning Rate (α)")
                    discount_factor = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.01, label="Discount Factor (γ)")
                    exploration_rate = gr.Slider(minimum=0.1, maximum=1.0, value=1.0, step=0.01, label="Initial Exploration Rate (ε)")
                    exploration_decay = gr.Slider(minimum=0.9, maximum=0.999, value=0.995, step=0.001, label="Exploration Decay Rate")
                    episodes = gr.Slider(minimum=10, maximum=500, value=100, step=10, label="Number of Episodes")
                    train_btn = gr.Button("Train Agent")

            with gr.Row():
                train_env_display = gr.Plot(label="Training Environment")
                train_metrics_display = gr.Plot(value=init_metrics_fig, label="Training Metrics")

            train_log = gr.Textbox(label="Training Log", lines=10)

        with gr.Tab("Test Agent"):
            with gr.Row():
                test_btn = gr.Button("Test Trained Agent")

            with gr.Row():
                test_env_display = gr.Plot(label="Test Environment")

            with gr.Row():
                with gr.Column():
                    path_display = gr.Textbox(label="Path Taken", lines=10)
                    test_result = gr.Textbox(label="Test Result")

        # Callbacks are defined inside the Blocks context so they can be wired
        # to the buttons above.
        def setup_environment(height, width):
            nonlocal env, agent
            height, width = int(height), int(width)
            env = GridWorld(height=height, width=width)
            agent = QLearningAgent(
                n_states=env.n_states,
                n_actions=env.n_actions,
                learning_rate=0.1,
                discount_factor=0.9,
                exploration_rate=1.0,
                exploration_decay=0.995
            )
            env_fig = create_gridworld_figure(env, agent)
            info_text = f"{height}x{width} GridWorld with start at (0,0) and goal at ({height - 1},{width - 1})"
            if env.obstacles:
                info_text += f"\nObstacles at: {env.obstacles}"
            return env_fig, info_text

        setup_btn.click(
            setup_environment,
            inputs=[grid_height, grid_width],
            outputs=[env_display, setup_info]
        )

        def start_training(lr, df, er, ed, eps):
            nonlocal env, agent
            # Re-create the agent with the chosen hyperparameters before training.
            agent = QLearningAgent(
                n_states=env.n_states,
                n_actions=env.n_actions,
                learning_rate=float(lr),
                discount_factor=float(df),
                exploration_rate=float(er),
                exploration_decay=float(ed)
            )
            env_fig, metrics_fig, log = train_agent(env, agent, int(eps))
            return env_fig, metrics_fig, log

        train_btn.click(
            start_training,
            inputs=[learning_rate, discount_factor, exploration_rate, exploration_decay, episodes],
            outputs=[train_env_display, train_metrics_display, train_log]
        )

        def test_trained_agent():
            nonlocal env, agent
            env_fig, path_text, result = run_test_episode(env, agent)
            return env_fig, path_text, result

        test_btn.click(
            test_trained_agent,
            inputs=[],
            outputs=[test_env_display, path_display, test_result]
        )

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch(share=True)