trinity.algorithm.algorithm module
Algorithm classes.
- class trinity.algorithm.algorithm.ConstantMeta(name, bases, namespace, /, **kwargs)[source]
  Bases: ABCMeta
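  The module does not document ConstantMeta's behavior here; judging only from the name and the ABCMeta base, one plausible reading is a metaclass that keeps class-level attributes constant. A minimal sketch under that assumption (the class ConstantMetaSketch and its write-guard are hypothetical, not the actual implementation):

```python
from abc import ABCMeta

# Hypothetical sketch only: assumes ConstantMeta guards existing class
# attributes against rebinding. The real ConstantMeta may behave differently.
class ConstantMetaSketch(ABCMeta):
    def __setattr__(cls, name, value):
        if name in cls.__dict__:
            raise AttributeError(f"{cls.__name__}.{name} is constant")
        super().__setattr__(name, value)

class Flags(metaclass=ConstantMetaSketch):
    use_critic = False

# Flags.use_critic = True  # would raise AttributeError under this sketch
```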
- class trinity.algorithm.algorithm.AlgorithmType[source]
  Bases: ABC
  Abstract base class for algorithm types; the concrete algorithms documented below override the class attributes listed here (a subclassing sketch follows after the list).
  - use_critic: bool
  - use_reference: bool
  - compute_advantage_in_trainer: bool
  - can_balance_batch: bool
  - schema: str
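  As a hedged illustration of how these flags are meant to be used, a new algorithm can be declared by subclassing AlgorithmType and overriding the attributes; the class name and flag values below are hypothetical, mirroring the pattern of the concrete classes that follow:

```python
from trinity.algorithm.algorithm import AlgorithmType

# Hypothetical algorithm definition; not part of the module itself.
class MyRLAlgorithm(AlgorithmType):
    use_critic: bool = False                    # no learned value network
    use_reference: bool = True                  # keep a frozen reference policy
    compute_advantage_in_trainer: bool = False  # advantages computed upstream
    can_balance_batch: bool = True              # batches may be re-balanced
    schema: str = "experience"                  # consumes rollout experiences
```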
- class trinity.algorithm.algorithm.SFTAlgorithm[source]
  SFT algorithm.
  - use_critic: bool = False
  - use_reference: bool = False
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'sft'
- class trinity.algorithm.algorithm.PPOAlgorithm[source]
  PPO algorithm.
  - use_critic: bool = True
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.GRPOAlgorithm[source]
  GRPO algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.ReinforcePlusPlusAlgorithm[source]
  Reinforce++ algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.RLOOAlgorithm[source]
  RLOO algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.OPMDAlgorithm[source]
  OPMD algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.AsymREAlgorithm[source]
  AsymRE algorithm.
  - use_critic: bool = False
  - use_reference: bool = False
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.DPOAlgorithm[source]
  DPO algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = False
  - schema: str = 'dpo'
- class trinity.algorithm.algorithm.TOPRAlgorithm[source]
  TOPR algorithm. See https://arxiv.org/pdf/2503.14286v1
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.CISPOAlgorithm[source]
  CISPO algorithm. See https://arxiv.org/abs/2506.13585
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.GSPOAlgorithm[source]
  GSPO algorithm. See https://arxiv.org/pdf/2507.18071
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.SAPOAlgorithm[source]
  SAPO (Soft Adaptive Policy Optimization) algorithm. SAPO replaces hard clipping with a smooth, temperature-controlled soft gate to stabilize training while maintaining effective learning; a sketch of the idea follows below.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
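  A minimal sketch of the soft-gate idea, assuming a PPO-style ratio-times-advantage objective; the tanh gate below illustrates a smooth, temperature-controlled saturation and is not the exact SAPO formulation:

```python
import torch

def hard_clip_objective(ratio, adv, eps=0.2):
    # PPO-style hard-clipped surrogate, shown for comparison.
    return torch.min(ratio * adv, torch.clamp(ratio, 1 - eps, 1 + eps) * adv)

def soft_gate_objective(ratio, adv, tau=0.2):
    # Smooth, temperature-controlled gate: tanh saturates gradually, and as
    # tau -> 0 the gated ratio approaches a hard clip to [1 - tau, 1 + tau],
    # while keeping gradients smooth everywhere.
    gated_ratio = 1 + tau * torch.tanh((ratio - 1) / tau)
    return gated_ratio * adv
```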
- class trinity.algorithm.algorithm.MIXAlgorithm[source]
  MIX algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - use_rollout: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.MIXCHORDAlgorithm[source]
  MIXCHORD algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - use_rollout: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.RAFTAlgorithm[source]
  RAFT algorithm. Conceptually similar to supervised fine-tuning (SFT), but designed to work with the experience schema produced by rollouts; a sketch of the idea follows below.
  - use_critic: bool = False
  - use_reference: bool = False
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
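  A hedged sketch of the reward-ranked fine-tuning idea behind RAFT: keep only the highest-reward rollout experiences and apply an SFT-style loss to them. The field names below (`reward`, `token_logprobs`) are illustrative, not the actual experience schema:

```python
def raft_select(experiences, keep_ratio=0.25):
    # Rank rollouts by reward and keep the top fraction for fine-tuning.
    ranked = sorted(experiences, key=lambda e: e["reward"], reverse=True)
    return ranked[: max(1, int(len(ranked) * keep_ratio))]

def sft_loss(token_logprobs):
    # Mean negative log-likelihood over the selected tokens, as in SFT.
    return -sum(token_logprobs) / len(token_logprobs)
```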
- class trinity.algorithm.algorithm.sPPOAlgorithm[source]
  sPPO algorithm.
  - use_critic: bool = False
  - use_reference: bool = False
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.RECAlgorithm[source]
  REC algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.MultiStepGRPOAlgorithm[source]
  Multi-step GRPO algorithm.
  - use_critic: bool = False
  - use_reference: bool = True
  - compute_advantage_in_trainer: bool = False
  - can_balance_batch: bool = True
  - schema: str = 'experience'
- class trinity.algorithm.algorithm.OnPolicyDistillAlgorithm[source]
  On-Policy Distillation algorithm. Reference: the Tinker library.
  The workflow stores teacher log-probabilities in experience.info["teacher_logprobs"]. The trainer's advantage_fn computes advantages = teacher_logprobs - student_logprobs, and the trainer uses:
  - an importance_sampling loss if no clipping is needed;
  - a ppo loss if clipping is needed, for better stability.
  A sketch of this computation follows below.
  - use_critic: bool = False
  - use_reference: bool = False
  - compute_advantage_in_trainer: bool = True
  - can_balance_batch: bool = True
  - schema: str = 'experience'
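  A sketch of the computation described in the docstring; only the formula advantages = teacher_logprobs - student_logprobs and the experience.info key come from the source, while the tensor plumbing and the loss function below are illustrative assumptions:

```python
import torch

def distill_advantages(experience_info: dict, student_logprobs: torch.Tensor) -> torch.Tensor:
    # Per-token advantage: how much more likely the teacher found each token.
    teacher_logprobs = experience_info["teacher_logprobs"]
    return teacher_logprobs - student_logprobs

def importance_sampling_loss(student_logprobs, rollout_logprobs, advantages):
    # Unclipped importance-sampling surrogate (the docstring's no-clipping
    # case); with clipping, a PPO-style loss would be used instead for
    # better stability.
    ratio = torch.exp(student_logprobs - rollout_logprobs)
    return -(ratio * advantages.detach()).mean()
```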