Source code for trinity.buffer.operators.mappers.reward_shaping_mapper

from typing import Dict, List, Optional, Tuple

from trinity.buffer.operators import EXPERIENCE_OPERATORS, ExperienceOperator
from trinity.common.constants import OpType
from trinity.common.experience import Experience


@EXPERIENCE_OPERATORS.register_module("reward_shaping_mapper")
class RewardShapingMapper(ExperienceOperator):
    """Reshapes the existing rewards of experiences based on rules or other advanced methods.

    Note: This mapper assumes that the reward is already calculated and stored in the
    Experience object, and that the necessary stats are already calculated and stored
    in the Experience info field.
    """
    def __init__(self, reward_shaping_configs: Optional[List[Dict]] = None):
        """Initializes the RewardShapingMapper.

        Args:
            reward_shaping_configs (list[dict], optional): A list of dictionaries
                containing reward shaping configurations. Each dictionary should
                include the following keys:

                - stats_key (str): The field key name of target stats used to shape
                  the reward.
                - op_type (str): The type of operator to apply between the reward and
                  the target stats. Should be one of {"ADD", "SUB", "MUL", "DIV"}.
                - weight (float): The weight for the target stats.

        Example:
            [
                {
                    "stats_key": "llm_quality_score",
                    "op_type": "ADD",
                    "weight": 1.0,
                }
            ]
        """
        if reward_shaping_configs is None:
            reward_shaping_configs = []
        self.reward_shaping_configs = reward_shaping_configs
    def process(self, exps: List[Experience]) -> Tuple[List[Experience], Dict]:
        """Applies every configured shaping step to each experience and reports how
        much the rewards changed via "reward_diff" metrics."""
        res_exps = []
        reward_diff = []
        for exp in exps:
            # skip experiences that don't have a reward
            if exp.reward is None:
                res_exps.append(exp)
                reward_diff.append(0.0)
                continue
            res_exp = exp
            previous_reward = exp.reward
            # apply each shaping config in order
            for reward_shaping_config in self.reward_shaping_configs:
                res_exp = self._reward_shaping_single(res_exp, reward_shaping_config)
            # defensive check: if the reward was lost during shaping, keep the original experience
            if res_exp.reward is None:
                res_exps.append(exp)
                reward_diff.append(0.0)
                continue
            res_reward = res_exp.reward
            reward_diff.append(res_reward - previous_reward)
            res_exps.append(res_exp)
        if len(reward_diff) == 0:
            return res_exps, {
                "reward_diff/mean": 0,
                "reward_diff/min": 0,
                "reward_diff/max": 0,
            }
        metrics = {
            "reward_diff/mean": 1.0 * sum(reward_diff) / len(reward_diff),
            "reward_diff/min": min(reward_diff),
            "reward_diff/max": max(reward_diff),
        }
        return res_exps, metrics
    def _reward_shaping_single(self, exp: Experience, reward_shaping_config: Dict):
        """Reshapes the existing reward of one experience based on the given
        reward_shaping_config.

        Args:
            exp (Experience): The experience object whose reward is to be reshaped.
            reward_shaping_config (dict): A dictionary containing the reward shaping
                configuration. It should include the following keys:

                - stats_key (str): The field key name of target stats used to shape
                  the reward.
                - op_type (str): The type of operator to apply between the reward and
                  the target stats. Should be one of {"ADD", "SUB", "MUL", "DIV"}.
                - weight (float): The weight for the target stats.

        Returns:
            Experience: The experience object with the reshaped reward.
        """
        tgt_stats = reward_shaping_config.get("stats_key", None)
        op_type = OpType[reward_shaping_config.get("op_type", "ADD")]
        weight = reward_shaping_config.get("weight", 1.0)
        # if the target stats is not specified, skip it and return the original experience
        if tgt_stats is None:
            return exp
        exp_info = exp.info
        if exp_info is None or len(exp_info) == 0:
            return exp
        # if the target stats does not exist in the exp info, skip it and return the
        # original experience
        if tgt_stats not in exp_info:
            return exp
        if op_type == OpType.ADD:
            exp.reward += weight * exp_info[tgt_stats]
        elif op_type == OpType.MUL:
            exp.reward *= weight * exp_info[tgt_stats]
        elif op_type == OpType.SUB:
            exp.reward -= weight * exp_info[tgt_stats]
        elif op_type == OpType.DIV:
            # avoid division by zero: leave the reward unchanged if the divisor is 0
            divisor = weight * exp_info[tgt_stats]
            if divisor != 0:
                exp.reward /= divisor
        return exp
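
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical example of wiring this mapper up by hand. It assumes
# that Experience can be constructed with `reward` and `info` keyword arguments;
# check trinity.common.experience for the actual required fields (e.g. token
# tensors) before running anything like this.
#
#     mapper = RewardShapingMapper(
#         reward_shaping_configs=[
#             {"stats_key": "llm_quality_score", "op_type": "ADD", "weight": 0.5},
#         ]
#     )
#     exps = [Experience(reward=1.0, info={"llm_quality_score": 0.8})]
#     shaped_exps, metrics = mapper.process(exps)
#     # shaped_exps[0].reward -> 1.0 + 0.5 * 0.8 == 1.4
#     # metrics -> {"reward_diff/mean": 0.4, "reward_diff/min": 0.4, "reward_diff/max": 0.4}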