Benchmark practices
1. Overview
In this guide, we walk through RM-Gallery's evaluation pipeline on two built-in reward benchmarks: RewardBench2 and RMB Bench.
2. Setup
import sys
import os
sys.path.append("../../..")
os.environ["OPENAI_API_KEY"] = ""
os.environ["BASE_URL"] = ""
3. RewardBench2
RewardBench2 implements a category-based routing system for specialized reward models (the dispatch pattern is sketched after this list). It supports the following categories:
- Safety (toxicity detection)
- Focus (content relevance assessment)
- Math (mathematical reasoning evaluation)
- Factuality (truthfulness verification)
- Precise IF (instruction-following capability assessment)
- General helpfulness (default fallback)
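Conceptually, the router reads a category label from each sample and dispatches it to the matching reward model, falling back to general helpfulness for unknown labels. A minimal sketch of that dispatch pattern (plain Python, not the library API):
# Toy dispatch: pick a handler by category, with a "general" fallback.
toy_handlers = {"safety": "SafetyListWiseReward", "math": "MathListWiseReward"}
def pick(category: str) -> str:
    return toy_handlers.get(category.lower(), "BaseHelpfulnessListWiseReward")
print(pick("Math"))    # -> MathListWiseReward
print(pick("coding"))  # -> BaseHelpfulnessListWiseReward (fallback)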
from typing import Dict, Type
from rm_gallery.core.data.load.base import create_loader
from rm_gallery.core.data.schema import DataSample
from rm_gallery.core.model.openai_llm import OpenaiLLM
# Reward base class and router composition used to build the pipeline
from rm_gallery.core.reward.base import BaseReward
from rm_gallery.core.reward.composition import RouterComposition
from rm_gallery.core.utils.acc import calc_acc
from rm_gallery.gallery.rm.alignment.base import BaseHelpfulnessListWiseReward
from rm_gallery.gallery.rm.alignment.harmlessness.safety import SafetyListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.focus import FocusListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.math import MathListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.precise_if import PreciseIFListWiseReward
from rm_gallery.gallery.rm.alignment.honesty.factuality import FactualityListWiseReward
# Configure local file loading parameters
config = {
    "path": "./data/reward-bench-2/data/test-00000-of-00001.parquet",
    "limit": 10,  # Limit the number of data items to load
}
# Create loading module
load_module = create_loader(
    name="rewardbench2",
    load_strategy_type="local",
    data_source="rewardbench2",
    config=config,
)
dataset = load_module.run()
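Before wiring up the router, we can peek at what was loaded; the router below keys on the `subset` field under each sample's `metadata["raw_data"]` (a quick inspection, assuming the loader populated that field):
# Inspect the loaded data: sample count and the routing key ("subset")
print(f"Loaded {len(dataset.datasamples)} samples")
print(dataset.datasamples[0].metadata["raw_data"]["subset"])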
# Define router
class RewardBench2Router(RouterComposition):
    rewards: Dict[str, Type[BaseReward]] = {
        "safety": SafetyListWiseReward,
        "focus": FocusListWiseReward,
        "math": MathListWiseReward,
        "factuality": FactualityListWiseReward,
        "precise_if": PreciseIFListWiseReward,
        "general": BaseHelpfulnessListWiseReward,
    }

    def _condition(self, sample: DataSample) -> str:
        # Route on the sample's subset label, normalized to lowercase
        # e.g. "Safety" -> "safety"
        try:
            cond = sample.metadata["raw_data"]["subset"].lower()
        except Exception:
            # Fall back to the general reward model when the subset is missing
            cond = "general"
        if cond not in self.rewards:
            cond = "general"
        return cond
# Initialize router
router = RewardBench2Router(
    name="reward-bench-2-router",
    params={
        "llm": OpenaiLLM(model="qwen3-235b-a22b", enable_thinking=True),
    },
)
# Process each sample through the appropriate reward model
results = router.evaluate_batch(dataset.datasamples, max_workers=128)
print(f"Processed {len(results)} samples with RewardBench2")
print(f"Accuracy: {calc_acc(results)}")
4. RMB Bench
RMB Bench provides task-type specific reward modeling for diverse NLP tasks (a small illustration of the path-based routing key follows this list), including:
- Brainstorming quality assessment
- Chat response evaluation
- Classification accuracy scoring
- Code generation quality assessment
- Content generation evaluation
- Open QA and closed QA assessment
- Reasoning capability evaluation
- Text rewriting quality
- Role-playing performance
- Summarization effectiveness
- Translation quality
- General helpfulness (default fallback)
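The router below keys on the second-to-last segment of each sample's slash-separated `category_path`. A tiny illustration with a hypothetical path (the real values come from the dataset):
# Hypothetical category_path; the router keys on the second-to-last segment
path = "helpfulness/translation/en-to-zh"
print(path.split("/")[-2].lower())  # -> "translation"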
from rm_gallery.core.data.load.base import create_loader
from rm_gallery.gallery.rm.alignment.helpfulness.brainstorming import BrainstormingListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.chat import ChatListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.classification import ClassificationListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.closed_qa import ClosedQAListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.code import CodeListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.generation import GenerationListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.open_qa import OpenQAListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.reasoning import ReasoningListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.rewrite import RewriteListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.role_playing import RolePlayingListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.summarization import SummarizationListWiseReward
from rm_gallery.gallery.rm.alignment.helpfulness.translation import TranslationListWiseReward
# Configure local file loading parameters
# (this example reuses the RewardBench2 file; point `path` at the
# RMB Bench data when evaluating that benchmark)
config = {
    "path": "./data/reward-bench-2/data/test-00000-of-00001.parquet",
    "limit": 1000,  # Limit the number of data items to load
}
# Create data loader
loader = create_loader(
    name="rewardbench2",          # Dataset name
    load_strategy_type="local",   # Use local file loading strategy
    data_source="rewardbench2",   # Specify data source format converter
    config=config,                # Pass configuration parameters
)
# Execute data loading
dataset = loader.run()
# Define router
class RMBBenchRouter(RouterComposition):
    rewards: Dict[str, Type[BaseReward]] = {
        "brainstorming": BrainstormingListWiseReward,
        "chat": ChatListWiseReward,
        "classification": ClassificationListWiseReward,
        "closed_qa": ClosedQAListWiseReward,
        "code": CodeListWiseReward,
        "generation": GenerationListWiseReward,
        "open_qa": OpenQAListWiseReward,
        "reasoning": ReasoningListWiseReward,
        "rewrite": RewriteListWiseReward,
        "role_playing": RolePlayingListWiseReward,
        "summarization": SummarizationListWiseReward,
        "translation": TranslationListWiseReward,
        "general": BaseHelpfulnessListWiseReward,
    }

    def _condition(self, sample: DataSample) -> str:
        # Extract the task category from the slash-separated path and
        # normalize to lowercase, e.g. ".../translation/..." -> "translation"
        try:
            cond = sample.metadata["category_path"].split("/")[-2].lower()
        except Exception:
            # Fall back to the general reward model when path extraction fails
            cond = "general"
        if cond not in self.rewards:
            cond = "general"
        return cond
# Initialize router
rmb_router = RMBBenchRouter(
    name="rmb-bench-router",
    params={
        "llm": OpenaiLLM(model="qwen3-235b-a22b", enable_thinking=True),
    },
)
# Process samples with automatic task detection
results = rmb_router.evaluate_batch(dataset.datasamples, max_workers=128)
print(f"Processed {len(results)} samples with RMB Bench")
print(f"Accuracy: {calc_acc(results)}")