
python - PPO Model Fails to Maintain Learned Process Despite Increasing Explained Variance - Stack Overflow


I'm working on a custom Gymnasium environment using Stable-Baselines3’s PPO. My task is to have an agent keep its yaw error as close to zero as possible. In training, it initially learns to keep the yaw aligned, but after a while, the yaw error starts to drift (sometimes increasing, sometimes decreasing) even though the training logs (such as the increasing explained_variance) seem to indicate that learning is progressing. No matter how I adjust the reward calculation or tweak the model parameters, I can’t get the agent to consistently maintain the correct yaw.

Below is a simplified version of my environment’s step() and calculate_reward() functions, along with the PPO model parameters I’ve experimented with:

def step(self, action, train=True):
    self.current_step += 1
    
    # Scale action as before
    done = self.agent.step(action * 25)

    # Compute reward components
    reward, reward_dict, distance_error, yaw_error = self.calculate_reward(done)
    
    # Reward difference: using the delta between current and previous rewards
    prev_reward = reward
    reward = reward - self.prev_reward
    self.prev_reward = prev_reward
    
    # Construct the state: various agent kinematics and sensor readings
    state = np.concatenate([
        self.agent.nu[:3], 
        np.array([self.agent.eta[-1], self.agent.ref[-1], distance_error, yaw_error]),
        self.agent.prop_f, 
        self.agent.prop_r, 
        self.agent.prop_m
    ])
    
    self.info = {
        "reward": reward,
        "reward_dict": reward_dict,
        "prop_f": np.mean(self.agent.prop_f),
        "prop_m": np.mean(self.agent.prop_m),
        "distance_error": distance_error,
        "yaw_error": yaw_error,
        "pos_x": [self.agent.eta[0], self.agent.ref[0]],
        "pos_y": [self.agent.eta[1], self.agent.ref[1]],
        "pos_z": self.agent.eta[2],
    }

    if not train:
        self._render()
        print("\nStep:", self.current_step)
        for k, y in self.info.items():
            if k != "reward_dict":
                print(f"{k}{' '*(20-len(k))}: {y}")
        for k, y in self.info["reward_dict"].items():
            print(f"{k}{' '*(20-len(k))}: {y}")
    
    terminated = done == 1  # Episode ended due to task completion
    truncated = self.current_step >= self.config['max_steps_per_ep']  # Time limit
    
    return state, reward, terminated, truncated, self.info

def calculate_reward(self, done):
    # Get current yaw and compute the rotation matrix to transform velocities
    current_yaw = self.agent.eta[5]
    rotation_matrix = np.array([
        [np.cos(current_yaw), -np.sin(current_yaw), 0],
        [np.sin(current_yaw), np.cos(current_yaw), 0],
        [0, 0, 1]
    ])
    
    # Convert body-fixed velocities to global frame
    global_vel = rotation_matrix @ self.agent.nu[:3]
    
    vehicle_pos = np.array(self.agent.eta[:3])
    ref_pos = np.array(self.agent.ref[:3])
    direction_to_target = ref_pos - vehicle_pos
    distance = np.linalg.norm(direction_to_target)
    
    # 1. Directional Progress (Global Frame)
    target_direction = direction_to_target / (distance + 1e-6)
    velocity_toward_target = np.dot(global_vel, target_direction)
    progress_reward = velocity_toward_target / 2.5  # Normalized by target speed
    
    # 2. Yaw Alignment (Body-Fixed Frame)
    desired_yaw = np.arctan2(direction_to_target[1], direction_to_target[0])
    yaw_error = np.arctan2(np.sin(desired_yaw - current_yaw),
                           np.cos(desired_yaw - current_yaw))
    yaw_reward = np.exp(-2 * yaw_error**2)  # Gaussian reward based on yaw error
    
    # 6. Terminal Conditions
    terminal_reward = 0
    if done == 1:
        terminal_reward = 5 + 5.0 * (1 - self.current_step / self.config['max_steps_per_ep'])
    elif self.current_step >= self.config['max_steps_per_ep']:
        terminal_reward = -2 * distance
    
    # Combined Reward
    reward = (
        0.1 * progress_reward +
        1.0 * (yaw_reward - 0.7) +
        terminal_reward
    )
    
    # Dynamic Weight Adjustment when close to the target
    if distance < 5:
        reward += 0.1 * yaw_reward  # Emphasize precise alignment near target
        
    reward_dict = {
        'progress': progress_reward,
        'yaw': yaw_reward,
        'terminal': terminal_reward,
        'total': reward
    }
    
    return reward, reward_dict, distance, abs(yaw_error)
  • agent.nu: Represents the body-fixed velocity vector (e.g., [forward_velocity, lateral_velocity, vertical_velocity, ...]). This is used to calculate the vehicle’s movement and is transformed into the global frame using the current yaw.

  • agent.eta: This typically holds the pose information of the agent (position and orientation). In this example: agent.eta[:3] gives the current position (x, y, z). agent.eta[5] or agent.eta[-1] is used as the current yaw.

  • agent.ref: Represents the reference or target pose. agent.ref[:3] is the target position.

  • agent.prop_f, agent.prop_r, agent.prop_m: These are actuator or propeller signals: prop_f is the front propeller's force, prop_r the rear propeller's force, and prop_m the middle propeller's force. They are included in the state vector to give the agent information about its current actuation state; the chosen action updates these propeller signals (a sketch of the matching observation space follows this list).
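
For reference, here is a minimal sketch of how the observation and action spaces are set up so they stay consistent with the state vector assembled in step(). The propeller array lengths and the action dimension are placeholders (N_PROP_F, N_PROP_R, N_PROP_M, N_ACTIONS) and not the exact values used:

import numpy as np
import gymnasium as gym
from gymnasium import spaces

N_PROP_F = N_PROP_R = N_PROP_M = 1   # placeholder lengths of prop_f / prop_r / prop_m
N_ACTIONS = 3                        # placeholder number of actuator commands

class YawEnvSketch(gym.Env):
    def __init__(self):
        super().__init__()
        # 3 body-fixed velocities + [yaw, ref_yaw, distance_error, yaw_error] + propeller signals
        obs_dim = 3 + 4 + N_PROP_F + N_PROP_R + N_PROP_M
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(obs_dim,), dtype=np.float64)
        # Actions are multiplied by 25 inside step(), so a nominal [-1, 1] range is used here
        self.action_space = spaces.Box(low=-1.0, high=1.0,
                                       shape=(N_ACTIONS,), dtype=np.float32)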

Model parameters used (subject to change):

"model_params": {
    "tensorboard_log": "models/tensorboard_logs/",
    "policy": "MlpPolicy",
    "learning_rate": 0.0003,
    "policy_kwargs": {
        "net_arch": {
            "pi": [128, 64],
            "vf": [128, 64]
        },
        "activation_fn": "Tanh",
        "ortho_init": true,
        "optimizer_kwargs": {
            "eps": 1e-05
        },
        "log_std_init": -0.9
    },
    "vf_coef": 0.5,
    "ent_coef": 0.0,
    "max_grad_norm": 0.5,
    "clip_range": 0.2,
    "clip_range_vf": "",
    "n_epochs": 10,
    "target_kl": 0.05,
    "gae_lambda": 0.95,
    "verbose": 2,
    "device": "cpu"
}
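
This JSON gets translated into the Stable-Baselines3 PPO constructor roughly as follows (env stands for the environment instance, which is not shown above; "Tanh" maps to torch.nn.Tanh, and the empty "clip_range_vf" is treated as None, i.e. no value-function clipping):

import torch
from stable_baselines3 import PPO

model = PPO(
    policy="MlpPolicy",
    env=env,                      # the custom yaw environment instance (not shown above)
    learning_rate=3e-4,
    n_epochs=10,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=None,           # "" in the JSON, interpreted as no VF clipping
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    target_kl=0.05,
    policy_kwargs=dict(
        net_arch=dict(pi=[128, 64], vf=[128, 64]),
        activation_fn=torch.nn.Tanh,
        ortho_init=True,
        log_std_init=-0.9,
        optimizer_kwargs=dict(eps=1e-5),
    ),
    tensorboard_log="models/tensorboard_logs/",
    verbose=2,
    device="cpu",
)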

Has anyone encountered similar issues where a PPO agent initially learns the task but later “forgets” it or destabilizes its performance? Specifically:

  • Could the differential reward (subtracting the previous reward) be a source of instability?

  • How might one better balance multiple reward components (yaw alignment vs. progress vs. terminal rewards) to ensure the agent consistently maintains low yaw error?

  • Are there any tips for tuning PPO hyperparameters in such a mixed-reward scenario?

Any advice or pointers to resources on reward shaping in PPO would be greatly appreciated!
