
Is this a correct implementation of Flexible DDPG?


I am trying to use this flexible DDPG implementation to train a model that achieves the lowest possible PEB (position error bound) of a RIS (reconfigurable intelligent surface). The problem I am facing is that the numbers come out rather erratic: the reward (calculated as 1/PEB) and the PEB itself, which ideally should drop below the previous episode's PEB, just bounce around. This most likely means the RL model is not configured correctly. If that is the case, can someone let me know whether the code below has any fault?

Also, flexible DDPG is just DDPG with epsilon-greedy exploration: the same epsilon value both decides whether to explore at all and scales the noise added to the selected action.
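In other words, the exploration rule amounts to roughly the following (a minimal sketch of the idea only; actor here stands for any deterministic policy returning actions in [-1, 1], and epsilon is the current exploration value, so the names are illustrative rather than taken from my code below):

import numpy as np

def epsilon_greedy_action(actor, state, epsilon, rng=np.random):
    """epsilon both decides whether to explore and scales the exploration noise."""
    action = actor(state)  # deterministic policy output, assumed to lie in [-1, 1]
    if rng.uniform(0, 1) < epsilon:  # explore with probability epsilon
        noise = epsilon * rng.normal(0.0, 1.0, size=np.shape(action))
        action = np.clip(action + noise, -1.0, 1.0)
    return action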

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import copy

class ActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300]):
        super(ActorNetwork, self).__init__()
        layers = []
        prev_dim = state_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim) 
            ])
            prev_dim = hidden_dim
        
        layers.append(nn.Linear(prev_dim, action_dim))
        layers.append(nn.Tanh())
        
        self.policy_network = nn.Sequential(*layers)
    
    def forward(self, state):
        return self.policy_network(state)
    
    def get_action(self, state):
        action = self.forward(state)
        return action

class CriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300]):
        super(CriticNetwork, self).__init__()
        self.state_layer = nn.Sequential(
            nn.Linear(state_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.LayerNorm(hidden_dims[0])
        )
        layers = []
        prev_dim = hidden_dims[0] + action_dim
        for hidden_dim in hidden_dims[1:]:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, 1))
        
        self.q_network = nn.Sequential(*layers)
        
    def forward(self, state, action):
        state_features = self.state_layer(state)
        combined = torch.cat([state_features, action], dim=1)
        q_value = self.q_network(combined)
        
        return q_value

class TargetActorNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], tau=0.001):
        super(TargetActorNetwork, self).__init__()
        
        self.tau = tau
        self.online_network = ActorNetwork(state_dim, action_dim, hidden_dims)
        self.target_network = copy.deepcopy(self.online_network)
        for param in self.target_network.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        return self.target_network(state)
    
    def soft_update(self):
        for target_param, online_param in zip(self.target_network.parameters(), 
                                            self.online_network.parameters()):
            target_param.data.copy_(
                self.tau * online_param.data + (1.0 - self.tau) * target_param.data
            )
    
    def get_online_network(self):
        return self.online_network
    
    def get_target_network(self):
        return self.target_network

class TargetCriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], tau=0.001):
        super(TargetCriticNetwork, self).__init__()
        
        self.tau = tau
        self.online_network = CriticNetwork(state_dim, action_dim, hidden_dims)
        self.target_network = copy.deepcopy(self.online_network)
        for param in self.target_network.parameters():
            param.requires_grad = False
            
    def forward(self, state, action):
        return self.target_network(state, action)
    
    def soft_update(self):
        for target_param, online_param in zip(self.target_network.parameters(), 
                                            self.online_network.parameters()):
            target_param.data.copy_(
                self.tau * online_param.data + (1.0 - self.tau) * target_param.data
            )
    
    def get_online_network(self):
        return self.online_network
    
    def get_target_network(self):
        return self.target_network
    
class ReplayBuffer:
    def __init__(self, capacity, state_dim):
        self.buffer = deque(maxlen=capacity)
        self.state_dim = state_dim 
        
    def push(self, state, action, reward, next_state):
        state = np.squeeze(np.array(state))
        next_state = np.squeeze(np.array(next_state))
        assert state.shape == (self.state_dim,), f"State shape mismatch: {state.shape}"
        assert next_state.shape == (self.state_dim,), f"Next state shape mismatch: {next_state.shape}"
        
        self.buffer.append((state, action, reward, next_state))
    
    def __len__(self):
        return len(self.buffer)
        
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state = zip(*batch)

        state = np.array([np.squeeze(s) for s in state])
        next_state = np.array([np.squeeze(ns) for ns in next_state])
        
        action = np.array(action)
        reward = np.array(reward).reshape(-1, 1)
        
        assert state.shape == (batch_size, self.state_dim), f"State shape mismatch: {state.shape}"
        assert next_state.shape == (batch_size, self.state_dim), f"Next state shape mismatch: {next_state.shape}"
        assert action.shape[0] == batch_size, f"Action batch size mismatch: {action.shape[0]} != {batch_size}"
        assert reward.shape[0] == batch_size, f"Reward batch size mismatch: {reward.shape[0]} != {batch_size}"
        
        return (torch.FloatTensor(state), 
                torch.FloatTensor(action),
                torch.FloatTensor(reward), 
                torch.FloatTensor(next_state))

class FLDDPG:
    def __init__(self, state_dim, action_dim, hidden_dims=[400, 300], buffer_size=10000,
                 batch_size=16, gamma=0.95, tau=0.00001, actor_lr=1e-3, critic_lr=1e-3,
                 lr_decay_rate=0.995, min_lr=1e-6):
        
        # Select the device: Apple MPS if available, otherwise CUDA, otherwise fall back to CPU
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        
        self.target_actor = TargetActorNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_dims=hidden_dims,
            tau=tau
        ).to(self.device)
        
        self.target_critic = TargetCriticNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_dims=hidden_dims,
            tau=tau
        ).to(self.device)
        
        self.actor = self.target_actor.get_online_network()
        self.critic = self.target_critic.get_online_network()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_buffer = ReplayBuffer(buffer_size, state_dim)

        self.initial_actor_lr = actor_lr
        self.initial_critic_lr = critic_lr
        self.lr_decay_rate = lr_decay_rate
        self.min_lr = min_lr
        self.current_actor_lr = actor_lr
        self.current_critic_lr = critic_lr

        self.actor_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=self.actor_optimizer, 
            gamma=lr_decay_rate
        )
        self.critic_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=self.critic_optimizer, 
            gamma=lr_decay_rate
        )

    def decay_learning_rates(self):
        self.actor_scheduler.step()
        for param_group in self.actor_optimizer.param_groups:
            param_group['lr'] = max(param_group['lr'], self.min_lr)
        self.current_actor_lr = self.actor_optimizer.param_groups[0]['lr']
        self.critic_scheduler.step()
        for param_group in self.critic_optimizer.param_groups:
            param_group['lr'] = max(param_group['lr'], self.min_lr)
        self.current_critic_lr = self.critic_optimizer.param_groups[0]['lr']
        
    def select_action(self, state, explore, epsilon):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            if explore:
                action = self.actor.get_action(state)
                noise_scale = epsilon * np.random.normal(0, 1, size=action.shape)
                action = torch.clamp(action + torch.FloatTensor(noise_scale).to(self.device), -1, 1)
            else:
                action = self.actor(state)
        return action.cpu().numpy().squeeze()

    def update(self):
        if len(self.replay_buffer) < self.batch_size * 3:
            return 0, 0
        state_batch, action_batch, reward_batch, next_state_batch = self.replay_buffer.sample(self.batch_size)
        state_batch = state_batch.to(self.device)
        action_batch = action_batch.to(self.device)
        reward_batch = reward_batch.to(self.device)
        next_state_batch = next_state_batch.to(self.device)
        with torch.no_grad():
            next_actions = self.target_actor(next_state_batch)
            target_q = reward_batch + self.gamma * self.target_critic(next_state_batch, next_actions)
        
        current_q = self.critic(state_batch, action_batch)
        critic_loss = torch.nn.MSELoss()(current_q, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0)
        self.critic_optimizer.step()

        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=1.0)
        self.actor_optimizer.step()

        self.target_actor.soft_update()
        self.target_critic.soft_update()
        
        return actor_loss.item(), critic_loss.item()

I tried to check whether the error was on the simulation side, but the values there looked as expected. The flow of the program also seems correct, with the following training loop:

for episode in range(num_episodes):
    matlab_state = self.eng.reset(self.sim)
    state = self.process_state(matlab_state)
    episode_reward = 0
    episode_losses = {'actor': [], 'critic': []}

    initial_peb = float(self.eng.calculatePerformanceMetrics(self.sim))
    min_peb_in_episode = initial_peb
    peb_values_in_episode = [initial_peb]

    step_counter = 0

    for step in range(max_steps):
        value = np.random.uniform(0, 1)
        if value < epsilon_start:
            explore = True
        else:
            explore = False
        step_counter = step_counter + 1

        action = self.agent.select_action(state, explore, epsilon_start)

        matlab_action = matlab.double(action.tolist())

        next_matlab_state, reward, done = self.eng.step(self.sim, matlab_action, nargout=3)
        current_peb = float(self.eng.calculatePerformanceMetrics(self.sim))
        rate_values[episode].append(float(self.eng.getrate(self.sim)))

        peb_values_in_episode.append(current_peb)
        min_peb_in_episode = min(min_peb_in_episode, current_peb)

        next_state = self.process_state(next_matlab_state)
        done = bool(done)

        self.agent.replay_buffer.push(state, action, reward, next_state)
        actor_loss, critic_loss = self.agent.update()

        episode_reward += reward
        if actor_loss is not None:
            episode_losses['actor'].append(actor_loss)
            episode_losses['critic'].append(critic_loss)

        state = next_state
        epsilon_start = max(epsilon_en, epsilon_start * epsilon_decay_rate)

        if done:
            prev_peb = current_peb
            break

    episode_reward = episode_reward / step_counter
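For reference, this is the kind of standalone smoke test I can run to exercise the agent without the MATLAB simulator. The DummyEnv below is purely illustrative (it rewards actions close to a fixed target, with the distance playing the role of the PEB) and is not part of my actual setup:

import numpy as np

class DummyEnv:
    """Toy stand-in for the RIS simulator: reward peaks when the action hits a fixed target."""
    def __init__(self, state_dim=8, action_dim=4, seed=0):
        self.rng = np.random.default_rng(seed)
        self.state_dim, self.action_dim = state_dim, action_dim
        self.target = self.rng.uniform(-0.5, 0.5, size=action_dim)

    def reset(self):
        self.state = self.rng.normal(size=self.state_dim)
        return self.state

    def step(self, action):
        # Proxy "PEB": distance between the chosen action and the fixed optimum
        peb = np.linalg.norm(action - self.target) + 1e-3
        reward = 1.0 / peb
        self.state = self.rng.normal(size=self.state_dim)
        return self.state, reward, False

env = DummyEnv()
agent = FLDDPG(state_dim=env.state_dim, action_dim=env.action_dim)
epsilon = 1.0
for episode in range(50):
    state, episode_reward = env.reset(), 0.0
    for step in range(50):
        explore = np.random.uniform(0, 1) < epsilon
        action = agent.select_action(state, explore, epsilon)
        next_state, reward, done = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state)
        agent.update()
        state = next_state
        episode_reward += reward
        epsilon = max(0.05, epsilon * 0.995)
    print(episode, episode_reward / 50)  # average reward should trend upward if the agent learns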