Cookbook

Practical examples for common training scenarios.

FSDP Training

Fully Sharded Data Parallel training with Transformers:

from peft import LoraConfig
import twinkle
from twinkle import DeviceMesh
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import TransformersModel
from twinkle.preprocessor import SelfCognitionProcessor

# FSDP with 4 shards, 2-way data parallel
device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

def train():
    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition'))
    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
    dataset.map(SelfCognitionProcessor('Twinkle', 'ModelScope'))
    dataset.encode()
    
    dataloader = DataLoader(dataset=dataset, batch_size=8)
    model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
    
    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
    model.add_adapter_to_model('default', lora_config)
    model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
    model.set_lr_scheduler(
        scheduler_cls='CosineWarmupScheduler',
        num_warmup_steps=5,
        num_training_steps=len(dataloader)
    )
    
    for step, batch in enumerate(dataloader):
        model.forward_backward(inputs=batch)
        model.clip_grad_and_step()
    
    model.save('fsdp-checkpoint')

if __name__ == '__main__':
    train()

Run with:

torchrun --nproc_per_node=8 train.py

MoE Training

Training Mixture of Experts models:

import twinkle
from twinkle import DeviceMesh
from twinkle.model import TransformersModel

# Expert parallelism + FSDP
device_mesh = DeviceMesh.from_sizes(ep_size=2, fsdp_size=4)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

model = TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B')
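
The MoE model trains with the same loop as the other recipes. A minimal continuation sketch, assuming the dataset and dataloader are prepared as in the FSDP example above:

# Continuation sketch; reuses the training API shown in the FSDP recipe.
model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)

for batch in dataloader:
    model.forward_backward(inputs=batch)
    model.clip_grad_and_step()

model.save('moe-checkpoint')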

Sequence Parallelism

For long-context training, sequence parallelism splits each sequence across devices:

import twinkle
from twinkle import DeviceMesh
from twinkle.model import TransformersModel

# 4-way sequence parallel, 2-way data parallel
device_mesh = DeviceMesh.from_sizes(sp_size=4, dp_size=2)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

model = TransformersModel(
    model_id='ms://Qwen/Qwen3.5-4B',
    sequence_parallel=True
)
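
With sp_size=4 and dp_size=2 the mesh spans 4 × 2 = 8 processes, so the launch command matches the FSDP example:

torchrun --nproc_per_node=8 train.py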

GRPO Training

Group Relative Policy Optimization:

import twinkle
from twinkle import DeviceMesh, DeviceGroup
from twinkle.advantage import GRPOAdvantage
from twinkle.data_format import SamplingParams
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import TransformersModel
from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward
from twinkle.sampler import vLLMSampler

MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
NUM_GENERATIONS = 8

device_groups = [
    DeviceGroup(name='model', ranks=4, device_type='cuda'),
    DeviceGroup(name='sampler', ranks=4, device_type='cuda'),
]
model_mesh = DeviceMesh.from_sizes(world_size=4, dp_size=4)
sampler_mesh = DeviceMesh.from_sizes(world_size=4, dp_size=4)
twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups)

def train():
    # Dataset
    dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', split='train'))
    dataset.set_template('Template', model_id=MODEL_ID)
    dataset.encode(add_generation_prompt=True)
    dataloader = DataLoader(dataset=dataset, batch_size=16)
    
    # Model
    model = TransformersModel(
        model_id=MODEL_ID,
        remote_group='model',
        device_mesh=model_mesh
    )
    model.set_loss('GRPOLoss', epsilon=0.2)
    model.set_optimizer('AdamW', lr=1e-5)
    
    # Sampler
    sampler = vLLMSampler(
        model_id=MODEL_ID,
        device_mesh=sampler_mesh,
        remote_group='sampler'
    )
    
    # Reward and Advantage
    accuracy_reward = GSM8KAccuracyReward()
    format_reward = GSM8KFormatReward()
    advantage_fn = GRPOAdvantage()
    
    sampling_params = SamplingParams(max_tokens=4096, num_samples=1)
    
    for batch in dataloader:
        # Sample completions
        responses = sampler.sample(batch * NUM_GENERATIONS, sampling_params)
        
        # Compute rewards
        accuracy = accuracy_reward(responses)
        format_r = format_reward(responses)
        total_rewards = [a + f for a, f in zip(accuracy, format_r)]
        
        # Compute advantages
        advantages = advantage_fn(
            total_rewards,
            num_generations=NUM_GENERATIONS,
            scale='group'
        ).tolist()
        
        # Extract data
        inputs = [seq.new_input_feature for r in responses for seq in r.sequences]
        old_logps = [[lp[0][1] for lp in seq.logprobs] for r in responses for seq in r.sequences]
        
        # Train
        model.forward_backward(
            inputs=inputs,
            old_logps=old_logps,
            advantages=advantages
        )
        model.clip_grad_and_step()
    
    model.save('grpo-checkpoint')

if __name__ == '__main__':
    train()
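
GRPOAdvantage with scale='group' normalizes each prompt's rewards against that prompt's own group of generations. A plain-NumPy sketch of the computation (the function name and the 1e-4 stabilizer are illustrative, not Twinkle's exact implementation):

import numpy as np

def group_relative_advantage(rewards, num_generations):
    # Rewards for the same prompt sit in one row; normalize each row
    # by its own mean and standard deviation.
    r = np.asarray(rewards, dtype=np.float32).reshape(-1, num_generations)
    mean = r.mean(axis=1, keepdims=True)
    std = r.std(axis=1, keepdims=True)
    return ((r - mean) / (std + 1e-4)).reshape(-1)

# Two prompts with two generations each
print(group_relative_advantage([1.0, 0.0, 0.5, 0.5], num_generations=2))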

GKD Training

Generalized Knowledge Distillation:

from twinkle.model import TransformersModel

# Teacher and student models
teacher = TransformersModel(model_id='ms://Qwen/Qwen3.5-72B', requires_grad=False)
student = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')

student.set_loss('GKDLoss', teacher=teacher, temperature=2.0)

# dataloader prepared as in the FSDP example
for batch in dataloader:
    student.forward_backward(inputs=batch)
    student.clip_grad_and_step()
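
GKDLoss trains the student to match the teacher's token-level distribution at the configured temperature. A minimal sketch of one common distillation objective, temperature-scaled forward KL (GKDLoss itself may use the generalized JSD formulation from the GKD paper, so treat this as illustrative):

import torch.nn.functional as F

def soft_distillation_loss(student_logits, teacher_logits, temperature=2.0):
    # KL between temperature-softened teacher and student distributions,
    # rescaled by T^2 so gradient magnitudes stay comparable.
    s = F.log_softmax(student_logits / temperature, dim=-1)
    t = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(s, t, reduction='batchmean') * temperature ** 2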

Megatron Training

Using the Megatron backend for 3D parallelism:

from peft import LoraConfig

import twinkle
from twinkle import DeviceMesh
from twinkle.model.megatron import MegatronModel

# Tensor + pipeline + data parallelism (2 x 2 x 2 = 8 GPUs)
device_mesh = DeviceMesh.from_sizes(tp_size=2, pp_size=2, dp_size=2)
twinkle.initialize(mode='ray', global_device_mesh=device_mesh)

model = MegatronModel(
    model_id='ms://Qwen/Qwen3.5-4B',
    device_mesh=device_mesh,
    mixed_precision='bf16'
)

lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
model.add_adapter_to_model('default', lora_config)
model.set_optimizer('AdamW', lr=1e-4)

# dataloader prepared as in the FSDP example
for batch in dataloader:
    model.forward_backward(inputs=batch)
    model.clip_grad_and_step()
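
As in the other recipes, the run can end with a checkpoint save:

model.save('megatron-checkpoint')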

Custom Reward Function

Implementing domain-specific rewards:

from twinkle.reward.base import Reward
from typing import List

class MyCustomReward(Reward):
    def __call__(self, trajectories: List[dict], ground_truths: List[dict]) -> List[float]:
        rewards = []
        for traj in trajectories:
            # Extract completion
            messages = traj.get('messages', [])
            completion = ''
            for msg in reversed(messages):
                if msg.get('role') == 'assistant':
                    completion = msg.get('content', '')
                    break
            
            # Custom scoring logic
            score = self.compute_score(completion)
            rewards.append(score)
        
        return rewards
    
    def compute_score(self, completion: str) -> float:
        # Your scoring logic here
        return 1.0 if 'correct' in completion else 0.0
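
Usage mirrors the built-in rewards in the GRPO recipe; the trajectory below is an illustrative stand-in for real rollout data:

reward_fn = MyCustomReward()
trajectories = [{
    'messages': [
        {'role': 'user', 'content': 'Is this right?'},
        {'role': 'assistant', 'content': 'Yes, that is correct.'},
    ]
}]
print(reward_fn(trajectories, ground_truths=[{}]))  # [1.0]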

Using HuggingFace Models

Switch from ModelScope to HuggingFace by changing the model_id prefix:

# ModelScope
model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')

# HuggingFace
model = TransformersModel(model_id='hf://Qwen/Qwen3.5-4B')

NPU Support

Training on Ascend NPUs:

import twinkle
from twinkle import DeviceGroup

# One group of 8 Ascend NPUs
device_groups = [
    DeviceGroup(name='default', ranks=8, device_type='npu')
]

twinkle.initialize(mode='local', groups=device_groups)

See the Twinkle repository for more examples.
