trinity.algorithm.algorithm module

Algorithm classes.

class trinity.algorithm.algorithm.ConstantMeta(name, bases, namespace, /, **kwargs)[source]

Bases: ABCMeta

class trinity.algorithm.algorithm.AlgorithmType[source]

Bases: ABC

use_critic: bool
use_reference: bool
compute_advantage_in_trainer: bool
can_balance_batch: bool
schema: str
abstractmethod classmethod default_config() → Dict [source]
classmethod name() → str [source]
classmethod check_config(config: Config) → None [source]
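
Concrete algorithms subclass AlgorithmType, override the class attributes, and implement default_config(). A minimal sketch of such a subclass, for illustration only: the class name and config key below are hypothetical, and any registration step the framework may require is omitted.

    from typing import Dict

    from trinity.algorithm.algorithm import AlgorithmType


    class ToyAlgorithm(AlgorithmType):
        """Hypothetical algorithm type used only to illustrate the contract."""

        use_critic: bool = False
        use_reference: bool = True
        compute_advantage_in_trainer: bool = False
        can_balance_batch: bool = True
        schema: str = "experience"

        @classmethod
        def default_config(cls) -> Dict:
            # Placeholder key; real subclasses return their own defaults.
            return {"toy_coefficient": 1.0}
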
class trinity.algorithm.algorithm.SFTAlgorithm[source]

Bases: AlgorithmType

SFT Algorithm.

use_critic: bool = False
use_reference: bool = False
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'sft'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.PPOAlgorithm[source]

Bases: AlgorithmType

PPO Algorithm.

use_critic: bool = True
use_reference: bool = True
compute_advantage_in_trainer: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]
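
Because these are plain class attributes, a trainer can branch on them without instantiating anything. A small usage sketch (the strings returned by name() depend on the implementation in the source):

    from trinity.algorithm.algorithm import PPOAlgorithm, SFTAlgorithm

    for algo in (SFTAlgorithm, PPOAlgorithm):
        print(
            algo.name(),
            "needs critic" if algo.use_critic else "no critic",
            "needs reference model" if algo.use_reference else "no reference model",
            f"schema={algo.schema}",
        )
    # PPO is the only algorithm on this page with use_critic=True; it also
    # computes advantages inside the trainer (compute_advantage_in_trainer=True).
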
class trinity.algorithm.algorithm.GRPOAlgorithm[source]

Bases: AlgorithmType

GRPO algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.ReinforcePlusPlusAlgorithm[source]

Bases: AlgorithmType

Reinforce++ algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.RLOOAlgorithm[source]

Bases: AlgorithmType

RLOO algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.OPMDAlgorithm[source]

Bases: AlgorithmType

OPMD algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.AsymREAlgorithm[source]

Bases: AlgorithmType

AsymRE algorithm.

use_critic: bool = False
use_reference: bool = False
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.DPOAlgorithm[source]

Bases: AlgorithmType

DPO algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = False
schema: str = 'dpo'
classmethod default_config() → Dict [source]
classmethod check_config(config: Config) → None [source]
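
check_config returns None when a config is acceptable and is expected to signal a problem otherwise; the exact checks and exception type are defined in the source. A defensive usage sketch:

    from trinity.algorithm.algorithm import DPOAlgorithm

    def validate_dpo(config) -> None:
        # Fail early, before training starts, if the config does not
        # satisfy DPO's requirements. The exception type raised by
        # check_config is implementation-defined.
        try:
            DPOAlgorithm.check_config(config)
        except Exception as err:
            raise RuntimeError(f"Config is incompatible with DPO: {err}") from err
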
class trinity.algorithm.algorithm.TOPRAlgorithm[source]

Bases: AlgorithmType

TOPR algorithm. See https://arxiv.org/pdf/2503.14286v1

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.CISPOAlgorithm[source]

Bases: AlgorithmType

CISPO algorithm. See https://arxiv.org/abs/2506.13585

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.GSPOAlgorithm[source]

Bases: AlgorithmType

GSPO algorithm. See https://arxiv.org/pdf/2507.18071

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]
class trinity.algorithm.algorithm.SAPOAlgorithm[source]

Bases: AlgorithmType

SAPO (Soft Adaptive Policy Optimization) algorithm.

SAPO uses a smooth, temperature-controlled soft gate instead of hard clipping to stabilize training while maintaining effective learning.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]
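
The docstring above does not spell the gate out. Purely for intuition, a temperature-controlled soft gate on the importance ratio could look like the sketch below; this is an illustrative formulation, not necessarily SAPO's exact objective.

    import torch

    def hard_clip(ratio: torch.Tensor, eps: float = 0.2) -> torch.Tensor:
        # PPO-style hard clipping: gradients vanish abruptly outside the band.
        return torch.clamp(ratio, 1.0 - eps, 1.0 + eps)

    def soft_gate(ratio: torch.Tensor, eps: float = 0.2, tau: float = 0.05) -> torch.Tensor:
        # Illustrative soft gate: ~1 inside the trust region |ratio - 1| < eps,
        # decaying smoothly outside it, with temperature tau controlling how
        # sharp the transition is (tau -> 0 approaches a hard cut-off).
        gate = torch.sigmoid((eps - (ratio - 1.0).abs()) / tau)
        return ratio * gate

The smooth transition keeps gradients informative near the clipping boundary, which matches the stabilization motivation stated in the docstring.
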
class trinity.algorithm.algorithm.MIXAlgorithm[source]

Bases: AlgorithmType

MIX algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
use_rollout: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.MIXCHORDAlgorithm[source]

Bases: AlgorithmType

MIXCHORD algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
use_rollout: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.RAFTAlgorithm[source]

Bases: AlgorithmType

RAFT Algorithm. This algorithm is conceptually similar to Supervised Fine-Tuning (SFT) but is designed to work with the experience schema from rollouts.

use_critic: bool = False
use_reference: bool = False
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.sPPOAlgorithm[source]

Bases: AlgorithmType

sPPO Algorithm.

use_critic: bool = False
use_reference: bool = False
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.RECAlgorithm[source]

Bases: AlgorithmType

REC Algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.MultiStepGRPOAlgorithm[source]

Bases: AlgorithmType

Multi-Step GRPO Algorithm.

use_critic: bool = False
use_reference: bool = True
compute_advantage_in_trainer: bool = False
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]

class trinity.algorithm.algorithm.OnPolicyDistillAlgorithm[source]

Bases: AlgorithmType

On-Policy Distillation Algorithm.

Reference: Tinker library.

The workflow stores teacher log-probabilities in experience.info["teacher_logprobs"]. The trainer's advantage_fn computes:

    advantages = teacher_logprobs - student_logprobs

The trainer uses:

- importance_sampling loss if no clipping is needed
- ppo loss if clipping is needed, for better stability

use_critic: bool = False
use_reference: bool = False
compute_advantage_in_trainer: bool = True
can_balance_batch: bool = True
schema: str = 'experience'
classmethod default_config() → Dict [source]
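
The advantage computation described above is simple enough to write out directly. A sketch, assuming per-token log-probability tensors (the shapes are illustrative):

    import torch

    def distill_advantages(
        teacher_logprobs: torch.Tensor,  # e.g. [batch, seq_len], read from experience.info["teacher_logprobs"]
        student_logprobs: torch.Tensor,  # e.g. [batch, seq_len], from the current policy
    ) -> torch.Tensor:
        # Positive where the teacher assigns higher probability than the
        # student, so the update pushes the student toward the teacher.
        return teacher_logprobs - student_logprobs

Whether the trainer then applies the plain importance-sampling loss or the clipped PPO-style loss depends on whether clipping is needed, as the docstring notes.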