I am trying to use this flexible DDPG implementation to train a model that minimizes the PEB (position error bound) of a RIS (reconfigurable intelligent surface). The problem I am facing is that the numbers come out erratic: the reward (calculated as 1/PEB) and the PEB itself, which ideally should drop below the previous episode's value, just bounce around instead. The most obvious explanation is that the RL model is not configured correctly. If that is the case, can someone point out what is at fault in the code below?
For context, flexible DDPG here is just DDPG with an epsilon-greedy twist: the same epsilon value decides whether to explore at each step and also scales the Gaussian noise added to the selected action.
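To make that concrete, the per-step selection amounts to roughly the following (a minimal sketch of the idea only, not my actual implementation, which is further down; epsilon_greedy_action, actor, and action_dim here are just illustrative placeholders):

import numpy as np

def epsilon_greedy_action(actor, state, epsilon, action_dim):
    # Deterministic action from the policy, assumed to lie in [-1, 1]
    action = actor(state)
    # With probability epsilon we explore; the same epsilon scales the Gaussian noise
    if np.random.uniform(0, 1) < epsilon:
        noise = epsilon * np.random.normal(0, 1, size=action_dim)
        action = np.clip(action + noise, -1, 1)
    return action

In the real code the explore/exploit decision is made in the training loop and the noise is added inside select_action, but the idea is the same.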
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import copy
class ActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300]):
        super(ActorNetwork, self).__init__()
        layers = []
        prev_dim = state_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, action_dim))
        layers.append(nn.Tanh())
        self.policy_network = nn.Sequential(*layers)

    def forward(self, state):
        return self.policy_network(state)

    def get_action(self, state):
        action = self.forward(state)
        return action
class CriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300]):
        super(CriticNetwork, self).__init__()
        self.state_layer = nn.Sequential(
            nn.Linear(state_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.LayerNorm(hidden_dims[0])
        )
        layers = []
        prev_dim = hidden_dims[0] + action_dim
        for hidden_dim in hidden_dims[1:]:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, 1))
        self.q_network = nn.Sequential(*layers)

    def forward(self, state, action):
        state_features = self.state_layer(state)
        combined = torch.cat([state_features, action], dim=1)
        q_value = self.q_network(combined)
        return q_value
class TargetActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], tau=0.001):
        super(TargetActorNetwork, self).__init__()
        self.tau = tau
        self.online_network = ActorNetwork(state_dim, action_dim, hidden_dims)
        self.target_network = copy.deepcopy(self.online_network)
        for param in self.target_network.parameters():
            param.requires_grad = False

    def forward(self, state):
        return self.target_network(state)

    def soft_update(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target
        for target_param, online_param in zip(self.target_network.parameters(),
                                              self.online_network.parameters()):
            target_param.data.copy_(
                self.tau * online_param.data + (1.0 - self.tau) * target_param.data
            )

    def get_online_network(self):
        return self.online_network

    def get_target_network(self):
        return self.target_network
class TargetCriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], tau=0.001):
        super(TargetCriticNetwork, self).__init__()
        self.tau = tau
        self.online_network = CriticNetwork(state_dim, action_dim, hidden_dims)
        self.target_network = copy.deepcopy(self.online_network)
        for param in self.target_network.parameters():
            param.requires_grad = False

    def forward(self, state, action):
        return self.target_network(state, action)

    def soft_update(self):
        for target_param, online_param in zip(self.target_network.parameters(),
                                              self.online_network.parameters()):
            target_param.data.copy_(
                self.tau * online_param.data + (1.0 - self.tau) * target_param.data
            )

    def get_online_network(self):
        return self.online_network

    def get_target_network(self):
        return self.target_network
class ReplayBuffer:
    def __init__(self, capacity, state_dim):
        self.buffer = deque(maxlen=capacity)
        self.state_dim = state_dim

    def push(self, state, action, reward, next_state):
        state = np.squeeze(np.array(state))
        next_state = np.squeeze(np.array(next_state))
        assert state.shape == (self.state_dim,), f"State shape mismatch: {state.shape}"
        assert next_state.shape == (self.state_dim,), f"Next state shape mismatch: {next_state.shape}"
        self.buffer.append((state, action, reward, next_state))

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state = zip(*batch)
        state = np.array([np.squeeze(s) for s in state])
        next_state = np.array([np.squeeze(ns) for ns in next_state])
        action = np.array(action)
        reward = np.array(reward).reshape(-1, 1)
        assert state.shape == (batch_size, self.state_dim), f"State shape mismatch: {state.shape}"
        assert next_state.shape == (batch_size, self.state_dim), f"Next state shape mismatch: {next_state.shape}"
        assert action.shape[0] == batch_size, f"Action batch size mismatch: {action.shape[0]} != {batch_size}"
        assert reward.shape[0] == batch_size, f"Reward batch size mismatch: {reward.shape[0]} != {batch_size}"
        return (torch.FloatTensor(state),
                torch.FloatTensor(action),
                torch.FloatTensor(reward),
                torch.FloatTensor(next_state))
class FLDDPG:
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], buffer_size=10000,
                 batch_size=16, gamma=0.95, tau=0.00001, actor_lr=1e-3, critic_lr=1e-3,
                 lr_decay_rate=0.995, min_lr=1e-6):
        # Select the device: prefer Apple MPS, then CUDA, otherwise fall back to CPU
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.target_actor = TargetActorNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_dims=hidden_dims,
            tau=tau
        ).to(self.device)
        self.target_critic = TargetCriticNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_dims=hidden_dims,
            tau=tau
        ).to(self.device)
        self.actor = self.target_actor.get_online_network()
        self.critic = self.target_critic.get_online_network()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_buffer = ReplayBuffer(buffer_size, state_dim)
        self.initial_actor_lr = actor_lr
        self.initial_critic_lr = critic_lr
        self.lr_decay_rate = lr_decay_rate
        self.min_lr = min_lr
        self.current_actor_lr = actor_lr
        self.current_critic_lr = critic_lr
        self.actor_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=self.actor_optimizer,
            gamma=lr_decay_rate
        )
        self.critic_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=self.critic_optimizer,
            gamma=lr_decay_rate
        )

    def decay_learning_rates(self):
        self.actor_scheduler.step()
        for param_group in self.actor_optimizer.param_groups:
            param_group['lr'] = max(param_group['lr'], self.min_lr)
        self.current_actor_lr = self.actor_optimizer.param_groups[0]['lr']
        self.critic_scheduler.step()
        for param_group in self.critic_optimizer.param_groups:
            param_group['lr'] = max(param_group['lr'], self.min_lr)
        self.current_critic_lr = self.critic_optimizer.param_groups[0]['lr']

    def select_action(self, state, explore, epsilon):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            if explore:
                # Exploration: add epsilon-scaled Gaussian noise to the deterministic action
                action = self.actor.get_action(state)
                noise_scale = epsilon * np.random.normal(0, 1, size=action.shape)
                action = torch.clamp(action + torch.FloatTensor(noise_scale).to(self.device), -1, 1)
            else:
                action = self.actor(state)
        return action.cpu().numpy().squeeze()

    def update(self):
        # Wait until the buffer holds a few batches before learning
        if len(self.replay_buffer) < self.batch_size * 3:
            return 0, 0
        state_batch, action_batch, reward_batch, next_state_batch = self.replay_buffer.sample(self.batch_size)
        state_batch = state_batch.to(self.device)
        action_batch = action_batch.to(self.device)
        reward_batch = reward_batch.to(self.device)
        next_state_batch = next_state_batch.to(self.device)
        # Critic update: regress Q(s, a) towards r + gamma * Q_target(s', mu_target(s'))
        with torch.no_grad():
            next_actions = self.target_actor(next_state_batch)
            target_q = reward_batch + self.gamma * self.target_critic(next_state_batch, next_actions)
        current_q = self.critic(state_batch, action_batch)
        critic_loss = torch.nn.MSELoss()(current_q, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0)
        self.critic_optimizer.step()
        # Actor update: maximize Q(s, mu(s))
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=1.0)
        self.actor_optimizer.step()
        # Polyak-average both target networks
        self.target_actor.soft_update()
        self.target_critic.soft_update()
        return actor_loss.item(), critic_loss.item()
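For reference, the agent used in the training loop below is created roughly like this (a sketch only; the dimensions are placeholders, since in my setup they come from the RIS simulation, and the hyperparameters are just the defaults above):

state_dim = 64    # placeholder: length of the processed simulation state vector
action_dim = 32   # placeholder: one control value per RIS element
self.agent = FLDDPG(state_dim=state_dim,
                    action_dim=action_dim,
                    hidden_dims=[400, 300],
                    buffer_size=10000,
                    batch_size=16,
                    gamma=0.95,
                    tau=0.00001)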
I checked whether the error was on the simulation side, but the values there looked exactly as they were supposed to. The flow of the program also seems correct, with the following training loop:
for episode in range(num_episodes):
    matlab_state = self.eng.reset(self.sim)
    state = self.process_state(matlab_state)
    episode_reward = 0
    episode_losses = {'actor': [], 'critic': []}
    initial_peb = float(self.eng.calculatePerformanceMetrics(self.sim))
    min_peb_in_episode = initial_peb
    peb_values_in_episode = [initial_peb]
    step_counter = 0
    for step in range(max_steps):
        value = np.random.uniform(0, 1)
        if value < epsilon_start:
            explore = True
        else:
            explore = False
        step_counter = step_counter + 1
        action = self.agent.select_action(state, explore, epsilon_start)
        matlab_action = matlab.double(action.tolist())
        next_matlab_state, reward, done = self.eng.step(self.sim, matlab_action, nargout=3)
        current_peb = float(self.eng.calculatePerformanceMetrics(self.sim))
        rate_values[episode].append(float(self.eng.getrate(self.sim)))
        peb_values_in_episode.append(current_peb)
        min_peb_in_episode = min(min_peb_in_episode, current_peb)
        next_state = self.process_state(next_matlab_state)
        done = bool(done)
        self.agent.replay_buffer.push(state, action, reward, next_state)
        actor_loss, critic_loss = self.agent.update()
        episode_reward += reward
        if actor_loss is not None:
            episode_losses['actor'].append(actor_loss)
            episode_losses['critic'].append(critic_loss)
        state = next_state
        epsilon_start = max(epsilon_en, epsilon_start * epsilon_decay_rate)
        if done:
            prev_peb = current_peb
            break
    episode_reward = episode_reward / step_counter