Skip to content

general

AccuracyReward

Bases: BasePointWiseReward

Calculate accuracy (exact match rate) between generated content and reference answer.

This reward evaluates if the generated content matches the reference answer exactly. A score of 1.0 indicates an exact match, while 0.0 indicates no match.

Source code in rm_gallery/gallery/rm/general.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
@RewardRegistry.register("accuracy")
class AccuracyReward(BasePointWiseReward):
    """
    Exact-match accuracy between generated content and the reference answer.

    The stripped generated text is compared to the stripped reference text;
    the score is 1.0 on an exact match and 0.0 otherwise.
    """

    name: str = Field(default="accuracy", description="Accuracy reward")

    def _evaluate(
        self, sample: DataSample, **kwargs
    ) -> RewardResult[RewardDimensionWithScore]:
        """
        Score one sample by exact string comparison.

        Args:
            sample: Data sample whose first output carries the generated
                content and a ``label["reference"]`` gold answer.

        Returns:
            RewardResult: One dimension holding the 0/1 accuracy score.
        """
        answer = sample.output[0].answer
        generated = answer.content.strip()
        reference = answer.label.get("reference", "").strip()

        # 1.0 on exact match, 0.0 otherwise.
        accuracy = float(generated == reference)

        match_word = "matches" if accuracy == 1.0 else "does not match"
        detail = RewardDimensionWithScore(
            name=self.name,
            score=accuracy,
            reason=f"Generated content {match_word} reference exactly",
        )
        return RewardResult(
            name=self.name,
            details=[detail],
            extra_data={
                "generated": generated,
                "reference": reference,
                "accuracy": accuracy,
            },
        )

F1ScoreReward

Bases: BasePointWiseReward

Calculate F1 score between generated content and reference answer at word level.

This reward computes precision, recall and F1 score by comparing word overlap between generated and reference texts. Uses configurable tokenizer to support multilingual content including Chinese and English.

Source code in rm_gallery/gallery/rm/general.py
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
@RewardRegistry.register("f1_score")
class F1ScoreReward(BasePointWiseReward):
    """
    Word-level F1 score between generated content and the reference answer.

    Both texts are preprocessed and tokenized with a configurable tokenizer
    (supports multilingual content, e.g. Chinese via jieba or English via
    tiktoken); precision/recall are computed over the token *sets* and
    combined into the harmonic-mean F1.
    """

    name: str = Field(default="f1_score", description="F1 score reward")
    tokenizer_type: str = Field(
        default="tiktoken",
        description="Tokenizer type: 'tiktoken', 'jieba', or 'simple'",
    )
    encoding_name: str = Field(
        default="cl100k_base",
        description="Tiktoken encoding name (for tiktoken tokenizer)",
    )
    chinese_only: bool = Field(
        default=False,
        description="Whether to keep only Chinese characters (for jieba tokenizer)",
    )

    def __init__(self, **data):
        super().__init__(**data)
        # Build the tokenizer once at construction; reused for every sample.
        self._tokenizer = get_tokenizer(
            tokenizer_type=self.tokenizer_type,
            encoding_name=self.encoding_name,
            chinese_only=self.chinese_only,
        )

    def _evaluate(
        self, sample: DataSample, **kwargs
    ) -> RewardResult[RewardDimensionWithScore]:
        """
        Compute the token-overlap F1 for one sample.

        Args:
            sample: Data sample whose first output carries the generated
                content and a ``label["reference"]`` gold answer.

        Returns:
            RewardResult: One dimension holding the F1 score; precision,
            recall and the token sets are exposed via ``extra_data``.
        """
        answer = sample.output[0].answer
        hyp_text = answer.content.strip()
        ref_text = answer.label.get("reference", "").strip()

        tokenizer = self._tokenizer
        hyp_tokens = set(
            tokenizer.tokenize(tokenizer.preprocess_text(hyp_text, to_lower=True))
        )
        ref_tokens = set(
            tokenizer.tokenize(tokenizer.preprocess_text(ref_text, to_lower=True))
        )

        if hyp_tokens and ref_tokens:
            overlap = len(hyp_tokens & ref_tokens)
            precision = overlap / len(hyp_tokens)
            recall = overlap / len(ref_tokens)
            denom = precision + recall
            # Harmonic mean; guard against 0/0 when there is no overlap.
            f1 = 2 * precision * recall / denom if denom > 0 else 0.0
        elif hyp_tokens or ref_tokens:
            # Exactly one side is empty: no overlap is possible.
            precision = recall = f1 = 0.0
        else:
            # Both empty: treated as a perfect (vacuous) match.
            precision = recall = f1 = 1.0

        detail = RewardDimensionWithScore(
            name=self.name,
            score=f1,
            reason=f"F1 score: {f1:.3f} (Precision: {precision:.3f}, Recall: {recall:.3f})",
        )
        return RewardResult(
            name=self.name,
            details=[detail],
            extra_data={
                "f1_score": f1,
                "precision": precision,
                "recall": recall,
                "generated_tokens": list(hyp_tokens),
                "reference_tokens": list(ref_tokens),
                "tokenizer_type": self.tokenizer_type,
                "tokenizer_name": self._tokenizer.name,
            },
        )

NumberAccuracyReward

Bases: BasePointWiseReward

Check numerical calculation accuracy by comparing numbers in generated vs reference content.

This reward verifies if the numbers in the generated content match the numbers in the reference content within a specified tolerance.

Source code in rm_gallery/gallery/rm/general.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
@RewardRegistry.register("number_accuracy")
class NumberAccuracyReward(BasePointWiseReward):
    """
    Check numerical calculation accuracy by comparing numbers in generated vs reference content.

    Numbers are extracted from both texts in order of appearance and compared
    positionally; a number counts as correct when it is within ``tolerance``
    of its reference counterpart. The score is the fraction of reference
    numbers matched.
    """

    name: str = Field(default="number_accuracy", description="Number accuracy reward")
    tolerance: float = Field(default=1e-6, description="Numerical comparison tolerance")

    # Integers, decimals (including leading-dot like ".5"), and scientific
    # notation (e.g. "1e-3", "2.5E+4"). The previous pattern (-?\d+\.?\d*)
    # split "1e5" into two numbers and missed leading-dot decimals.
    _NUMBER_PATTERN = r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

    def _extract_numbers(self, text: str) -> List[float]:
        """Extract all numbers from *text*, in order of appearance.

        Args:
            text: Arbitrary text possibly containing numeric literals.

        Returns:
            List of numbers as floats, in the order they occur in the text.
        """
        return [float(n) for n in re.findall(self._NUMBER_PATTERN, text)]

    def _evaluate(
        self, sample: DataSample, **kwargs
    ) -> RewardResult[RewardDimensionWithScore]:
        """
        Check numerical accuracy for one sample.

        Args:
            sample: Data sample whose first output carries the generated
                content and a ``label["reference"]`` gold answer.

        Returns:
            RewardResult: One dimension holding the fraction of reference
            numbers reproduced within tolerance (0.0 when either side has
            no numbers).
        """
        generated = sample.output[0].answer.content
        reference = sample.output[0].answer.label.get("reference", "")

        generated_numbers = self._extract_numbers(generated)
        reference_numbers = self._extract_numbers(reference)

        def _zero_result(reason: str) -> RewardResult:
            # Shared builder for the two degenerate zero-score cases.
            return RewardResult(
                name=self.name,
                details=[
                    RewardDimensionWithScore(
                        name=self.name,
                        score=0.0,
                        reason=reason,
                    )
                ],
                extra_data={
                    "generated_numbers": generated_numbers,
                    "reference_numbers": reference_numbers,
                },
            )

        if not reference_numbers:
            return _zero_result("No reference numbers to compare")
        if not generated_numbers:
            return _zero_result("No numbers found in generated content")

        # Compare positionally: i-th generated number vs i-th reference
        # number. Extra generated numbers are ignored; missing ones count
        # as wrong because the denominator is len(reference_numbers).
        total = min(len(generated_numbers), len(reference_numbers))
        correct = sum(
            1
            for i in range(total)
            if abs(generated_numbers[i] - reference_numbers[i]) <= self.tolerance
        )

        # reference_numbers is guaranteed non-empty here (early return above).
        accuracy = correct / len(reference_numbers)

        return RewardResult(
            name=self.name,
            details=[
                RewardDimensionWithScore(
                    name=self.name,
                    score=accuracy,
                    reason=f"Number accuracy: {correct}/{len(reference_numbers)} numbers correct",
                )
            ],
            extra_data={
                "accuracy": accuracy,
                "correct_numbers": correct,
                "total_reference_numbers": len(reference_numbers),
                "generated_numbers": generated_numbers,
                "reference_numbers": reference_numbers,
            },
        )

RougeReward

Bases: BasePointWiseReward

ROUGE-L similarity evaluation using longest common subsequence

Source code in rm_gallery/gallery/rm/general.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
@RewardRegistry.register("rouge")
class RougeReward(BasePointWiseReward):
    """ROUGE-L similarity evaluation using longest common subsequence.

    Computes the LCS-based F-measure (ROUGE-L) between the whitespace-tokenized,
    lowercased generated content and reference answer.
    """

    name: str = Field(default="rouge", description="ROUGE similarity reward")

    def _lcs_length(self, x: List[str], y: List[str]) -> int:
        """Return the length of the longest common subsequence of x and y.

        Classic O(len(x) * len(y)) dynamic programming:
        dp[i][j] is the LCS length of x[:i] and y[:j].
        """
        m, n = len(x), len(y)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if x[i - 1] == y[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]

    def _evaluate(
        self, sample: DataSample, **kwargs
    ) -> RewardResult[RewardDimensionWithScore]:
        """
        Calculate the ROUGE-L score for one sample.

        Args:
            sample: Data sample whose first output carries the generated
                content and a ``label["reference"]`` gold answer.

        Returns:
            RewardResult: One dimension holding the ROUGE-L F-measure.
        """
        generated = sample.output[0].answer.content.strip().lower()
        reference = sample.output[0].answer.label.get("reference", "").strip().lower()

        # Simple whitespace tokenization.
        generated_tokens = generated.split()
        reference_tokens = reference.split()

        # Compute the (expensive) LCS at most once; the original code ran
        # the full DP a second time just to fill extra_data.
        lcs_len = 0
        if not generated_tokens and not reference_tokens:
            # Both empty: treated as a perfect (vacuous) match.
            rouge_l = 1.0
        elif not generated_tokens or not reference_tokens:
            # Exactly one side empty: no overlap possible.
            rouge_l = 0.0
        else:
            lcs_len = self._lcs_length(generated_tokens, reference_tokens)
            precision = lcs_len / len(generated_tokens)
            recall = lcs_len / len(reference_tokens)
            # Harmonic mean; guard against 0/0 when there is no overlap.
            rouge_l = (
                2 * precision * recall / (precision + recall)
                if (precision + recall) > 0
                else 0.0
            )

        return RewardResult(
            name=self.name,
            details=[
                RewardDimensionWithScore(
                    name=self.name,
                    score=rouge_l,
                    reason=f"ROUGE-L score: {rouge_l:.3f}",
                )
            ],
            extra_data={
                "rouge_l": rouge_l,
                "generated_length": len(generated_tokens),
                "reference_length": len(reference_tokens),
                "lcs_length": lcs_len,
            },
        )