rewardbench2

RewardBench2AnnotationTemplate

Bases: BaseAnnotationTemplate

Reward Bench 2 annotation template implementation for 4-way comparison

Source code in rm_gallery/gallery/data/annotation/rewardbench2.py
@AnnotationTemplateRegistry.register("rewardbench2")
class RewardBench2AnnotationTemplate(BaseAnnotationTemplate):
    """Reward Bench 2 annotation template implementation for 4-way comparison"""

    def __init__(self, name: str):
        super().__init__(name)

    @property
    def label_config(self) -> str:
        """Return the Label Studio XML configuration for reward bench 2 evaluation (4-way comparison)"""
        return """
<View>
  <!-- Sample Information -->
  <Header value="Sample Information"/>
  <Text name="unique_id" value="$unique_id" title="Unique ID"/>
  <Text name="source" value="$source" title="Source"/>
  <Text name="task_category" value="$task_category" title="task_category"/>
  <Text name="created_at" value="$created_at" title="Created At"/>
  <Text name="answer_count" value="$answer_count" title="Number of Answers"/>

  <!-- Input Messages -->
  <Header value="Input Messages"/>
  <Paragraphs name="input_dialogue" value="$input_messages" layout="dialogue" nameKey="role" textKey="content" />

  <!-- Output Responses -->
  <Header value="Output Responses"/>
  <Paragraphs name="output_dialogue" value="$output_messages" layout="dialogue" nameKey="role" textKey="content" />

  <!-- Step 1: Best Answer Selection -->
  <View>
    <Text name="step1_title" value="Step 1: Best Answer Selection" />
    <Text name="step1_desc1" value="Please select the best answer among the 4 options" />
    <Choices name="best_answer" toName="output_dialogue" choice="single" title="🏆 Best Answer">
      <Choice value="answer_1" showIf="$answer_count>=1"/>
      <Choice value="answer_2" showIf="$answer_count>=2"/>
      <Choice value="answer_3" showIf="$answer_count>=3"/>
      <Choice value="answer_4" showIf="$answer_count>=4"/>
      <Choice value="all_equal" showIf="$answer_count=4"/>
    </Choices>
  </View>

  <!-- Step 2: Answer Ranking -->
  <View>
    <Text name="step2_spacer" value="" />
    <Text name="step2_title" value="Step 2: Answer Ranking" />
    <Text name="step2_desc" value="Please rank all answers from best to worst (1=best, 4=worst)" />

    <Text name="answer1_rank_label" value="📝 Answer 1 Rank:" />
    <Choices name="answer1_rank" toName="output_dialogue" choice="single" title="Answer 1 Rank">
      <Choice value="1"/>
      <Choice value="2"/>
      <Choice value="3"/>
      <Choice value="4"/>
    </Choices>

    <Text name="answer2_rank_label" value="📝 Answer 2 Rank:" />
    <Choices name="answer2_rank" toName="output_dialogue" choice="single" title="Answer 2 Rank">
      <Choice value="1"/>
      <Choice value="2"/>
      <Choice value="3"/>
      <Choice value="4"/>
    </Choices>

    <Text name="answer3_rank_label" value="📝 Answer 3 Rank:" />
    <Choices name="answer3_rank" toName="output_dialogue" choice="single" title="Answer 3 Rank">
      <Choice value="1"/>
      <Choice value="2"/>
      <Choice value="3"/>
      <Choice value="4"/>
    </Choices>

    <Text name="answer4_rank_label" value="📝 Answer 4 Rank:" />
    <Choices name="answer4_rank" toName="output_dialogue" choice="single" title="Answer 4 Rank">
      <Choice value="1"/>
      <Choice value="2"/>
      <Choice value="3"/>
      <Choice value="4"/>
    </Choices>
  </View>

  <!-- Step 3: Answer Rating -->
  <View>
    <Text name="step3_spacer" value="" />
    <Text name="step3_title" value="Step 3: Answer Rating" />
    <Text name="step3_desc" value="Please rate the quality of each answer for the $task_category task_category (1-5 stars)" />

    <Text name="answer1_rating_label" value="📝 Answer 1 Rating:" />
    <Rating name="answer1_rating" toName="output_dialogue" maxRating="5" icon="star" size="medium" title="Answer 1 Quality Rating"/>

    <Text name="answer2_rating_label" value="📝 Answer 2 Rating:" />
    <Rating name="answer2_rating" toName="output_dialogue" maxRating="5" icon="star" size="medium" title="Answer 2 Quality Rating"/>

    <Text name="answer3_rating_label" value="📝 Answer 3 Rating:" />
    <Rating name="answer3_rating" toName="output_dialogue" maxRating="5" icon="star" size="medium" title="Answer 3 Quality Rating"/>

    <Text name="answer4_rating_label" value="📝 Answer 4 Rating:" />
    <Rating name="answer4_rating" toName="output_dialogue" maxRating="5" icon="star" size="medium" title="Answer 4 Quality Rating"/>

    <Text name="rating_criteria" value="💡 Rating Criteria: 5 stars = excellent, 4 stars = good, 3 stars = average, 2 stars = poor, 1 star = very poor" />
  </View>

  <!-- Step 4: Additional Comments -->
  <View>
    <Text name="step4_spacer" value="" />
    <Text name="step4_title" value="Step 4: Additional Comments" />
    <Text name="step4_desc" value="Please provide any additional comments or feedback" />
    <TextArea name="additional_comments" toName="output_dialogue" placeholder="[x] The x-th answer has the following issues..." title="Additional Comments"/>
  </View>

</View>
"""

    def process_annotations(self, annotation_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process annotation data specific to reward bench 2 evaluation (4-way comparison)

        Args:
            annotation_data: Generic annotation data with ratings, choices, text_areas

        Returns:
            Processed data structured for reward bench 2 evaluation
        """
        processed = {
            "best_answer": None,
            "answer_rankings": {},
            "answer_ratings": {},
            "ranking_order": [],
            "quality_comparison": {},
            "comments": "",
            "preference": None,
        }

        # Extract best answer selection (Step 1)
        if "best_answer" in annotation_data.get("choices", {}):
            best_answer_choices = annotation_data["choices"]["best_answer"]["choices"]
            if best_answer_choices:
                processed["best_answer"] = best_answer_choices[0]
                processed["preference"] = best_answer_choices[0]

        # Extract answer rankings (Step 2)
        choices = annotation_data.get("choices", {})
        rank_keys = ["answer1_rank", "answer2_rank", "answer3_rank", "answer4_rank"]

        for i, rank_key in enumerate(rank_keys, 1):
            if rank_key in choices:
                rank_choices = choices[rank_key]["choices"]
                if rank_choices:
                    processed["answer_rankings"][f"answer_{i}"] = int(rank_choices[0])

        # Create ranking order based on ranks
        if processed["answer_rankings"]:
            # Sort answers by their rank (1=best, 4=worst)
            sorted_answers = sorted(
                processed["answer_rankings"].items(), key=lambda x: x[1]
            )
            processed["ranking_order"] = [answer for answer, rank in sorted_answers]

        # Extract answer ratings (Step 3)
        ratings = annotation_data.get("ratings", {})
        rating_keys = [
            "answer1_rating",
            "answer2_rating",
            "answer3_rating",
            "answer4_rating",
        ]

        for i, rating_key in enumerate(rating_keys, 1):
            if rating_key in ratings:
                processed["answer_ratings"][f"answer_{i}"] = ratings[rating_key][
                    "rating"
                ]

        # Calculate quality comparison
        if processed["answer_ratings"]:
            # Find the highest rated answer
            best_rated_answer = max(
                processed["answer_ratings"].items(), key=lambda x: x[1]
            )

            # Calculate average rating
            avg_rating = sum(processed["answer_ratings"].values()) / len(
                processed["answer_ratings"]
            )

            processed["quality_comparison"] = {
                "best_rated_answer": best_rated_answer[0],
                "best_rating": best_rated_answer[1],
                "average_rating": avg_rating,
                "rating_spread": max(processed["answer_ratings"].values())
                - min(processed["answer_ratings"].values()),
                "consistency_check": {
                    "best_answer_matches_best_rating": processed["best_answer"]
                    == best_rated_answer[0],
                    "best_answer_matches_rank_1": processed["best_answer"]
                    in [
                        answer
                        for answer, rank in processed["answer_rankings"].items()
                        if rank == 1
                    ]
                    if processed["answer_rankings"]
                    else False,
                },
            }

        # Extract additional comments (Step 4)
        if "additional_comments" in annotation_data.get("text_areas", {}):
            processed["comments"] = annotation_data["text_areas"][
                "additional_comments"
            ]["text"]

        return processed

    def validate_annotation_data(self, annotation_data: Dict[str, Any]) -> bool:
        """
        Validate annotation data for reward bench 2 evaluation

        Args:
            annotation_data: Annotation data to validate

        Returns:
            True if valid, False otherwise
        """
        # Check if required fields are present
        required_sections = ["choices", "ratings"]
        for section in required_sections:
            if section not in annotation_data:
                return False

        # Check if best answer is selected
        if "best_answer" not in annotation_data.get("choices", {}):
            return False

        # Check if at least some rankings are provided
        choices = annotation_data.get("choices", {})
        rank_keys = ["answer1_rank", "answer2_rank", "answer3_rank", "answer4_rank"]
        if not any(key in choices for key in rank_keys):
            return False

        # Check if at least some ratings are provided
        ratings = annotation_data.get("ratings", {})
        rating_keys = [
            "answer1_rating",
            "answer2_rating",
            "answer3_rating",
            "answer4_rating",
        ]
        if not any(key in ratings for key in rating_keys):
            return False

        # Validate ranking consistency (each rank should be unique)
        provided_ranks = []
        for rank_key in rank_keys:
            if rank_key in choices:
                rank_choices = choices[rank_key]["choices"]
                if rank_choices:
                    rank = int(rank_choices[0])
                    if rank in provided_ranks:
                        return False  # Duplicate rank
                    provided_ranks.append(rank)

        return True

label_config property

Return the Label Studio XML configuration for reward bench 2 evaluation (4-way comparison)
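A minimal usage sketch, not taken from the source: it assumes only the module path shown above and that the resulting XML string is pasted into (or pushed to) a Label Studio project by some external means.

from rm_gallery.gallery.data.annotation.rewardbench2 import (
    RewardBench2AnnotationTemplate,
)

# The constructor only takes a name, which is passed through to the base class.
template = RewardBench2AnnotationTemplate(name="rewardbench2")

# label_config is a property returning the Label Studio XML above as a string.
xml_config = template.label_config
print(xml_config)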

process_annotations(annotation_data)

Process annotation data specific to reward bench 2 evaluation (4-way comparison)

Parameters:

    annotation_data (Dict[str, Any], required): Generic annotation data with ratings, choices, text_areas

Returns:

    Dict[str, Any]: Processed data structured for reward bench 2 evaluation

Source code in rm_gallery/gallery/data/annotation/rewardbench2.py
def process_annotations(self, annotation_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process annotation data specific to reward bench 2 evaluation (4-way comparison)

    Args:
        annotation_data: Generic annotation data with ratings, choices, text_areas

    Returns:
        Processed data structured for reward bench 2 evaluation
    """
    processed = {
        "best_answer": None,
        "answer_rankings": {},
        "answer_ratings": {},
        "ranking_order": [],
        "quality_comparison": {},
        "comments": "",
        "preference": None,
    }

    # Extract best answer selection (Step 1)
    if "best_answer" in annotation_data.get("choices", {}):
        best_answer_choices = annotation_data["choices"]["best_answer"]["choices"]
        if best_answer_choices:
            processed["best_answer"] = best_answer_choices[0]
            processed["preference"] = best_answer_choices[0]

    # Extract answer rankings (Step 2)
    choices = annotation_data.get("choices", {})
    rank_keys = ["answer1_rank", "answer2_rank", "answer3_rank", "answer4_rank"]

    for i, rank_key in enumerate(rank_keys, 1):
        if rank_key in choices:
            rank_choices = choices[rank_key]["choices"]
            if rank_choices:
                processed["answer_rankings"][f"answer_{i}"] = int(rank_choices[0])

    # Create ranking order based on ranks
    if processed["answer_rankings"]:
        # Sort answers by their rank (1=best, 4=worst)
        sorted_answers = sorted(
            processed["answer_rankings"].items(), key=lambda x: x[1]
        )
        processed["ranking_order"] = [answer for answer, rank in sorted_answers]

    # Extract answer ratings (Step 3)
    ratings = annotation_data.get("ratings", {})
    rating_keys = [
        "answer1_rating",
        "answer2_rating",
        "answer3_rating",
        "answer4_rating",
    ]

    for i, rating_key in enumerate(rating_keys, 1):
        if rating_key in ratings:
            processed["answer_ratings"][f"answer_{i}"] = ratings[rating_key][
                "rating"
            ]

    # Calculate quality comparison
    if processed["answer_ratings"]:
        # Find the highest rated answer
        best_rated_answer = max(
            processed["answer_ratings"].items(), key=lambda x: x[1]
        )

        # Calculate average rating
        avg_rating = sum(processed["answer_ratings"].values()) / len(
            processed["answer_ratings"]
        )

        processed["quality_comparison"] = {
            "best_rated_answer": best_rated_answer[0],
            "best_rating": best_rated_answer[1],
            "average_rating": avg_rating,
            "rating_spread": max(processed["answer_ratings"].values())
            - min(processed["answer_ratings"].values()),
            "consistency_check": {
                "best_answer_matches_best_rating": processed["best_answer"]
                == best_rated_answer[0],
                "best_answer_matches_rank_1": processed["best_answer"]
                in [
                    answer
                    for answer, rank in processed["answer_rankings"].items()
                    if rank == 1
                ]
                if processed["answer_rankings"]
                else False,
            },
        }

    # Extract additional comments (Step 4)
    if "additional_comments" in annotation_data.get("text_areas", {}):
        processed["comments"] = annotation_data["text_areas"][
            "additional_comments"
        ]["text"]

    return processed
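
A hedged example of calling process_annotations. The nested dict shape (choices entries carrying a "choices" list, ratings entries carrying a "rating", text_areas entries carrying a "text") is inferred from the attribute accesses in the source; the concrete values below are made up for illustration, and the real payload produced by the annotation pipeline may carry additional keys.

from rm_gallery.gallery.data.annotation.rewardbench2 import (
    RewardBench2AnnotationTemplate,
)

template = RewardBench2AnnotationTemplate(name="rewardbench2")

annotation_data = {
    "choices": {
        "best_answer": {"choices": ["answer_2"]},
        "answer1_rank": {"choices": ["2"]},
        "answer2_rank": {"choices": ["1"]},
        "answer3_rank": {"choices": ["4"]},
        "answer4_rank": {"choices": ["3"]},
    },
    "ratings": {
        "answer1_rating": {"rating": 4},
        "answer2_rating": {"rating": 5},
        "answer3_rating": {"rating": 2},
        "answer4_rating": {"rating": 3},
    },
    "text_areas": {
        "additional_comments": {"text": "[3] The third answer drifts off topic."}
    },
}

processed = template.process_annotations(annotation_data)
# processed["best_answer"]   -> "answer_2"
# processed["ranking_order"] -> ["answer_2", "answer_1", "answer_4", "answer_3"]
# processed["quality_comparison"]["average_rating"] -> 3.5
# processed["quality_comparison"]["consistency_check"]
#     -> both checks True, since answer_2 is also top-rated and ranked 1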

validate_annotation_data(annotation_data)

Validate annotation data for reward bench 2 evaluation

Parameters:

    annotation_data (Dict[str, Any], required): Annotation data to validate

Returns:

    bool: True if valid, False otherwise

Source code in rm_gallery/gallery/data/annotation/rewardbench2.py
def validate_annotation_data(self, annotation_data: Dict[str, Any]) -> bool:
    """
    Validate annotation data for reward bench 2 evaluation

    Args:
        annotation_data: Annotation data to validate

    Returns:
        True if valid, False otherwise
    """
    # Check if required fields are present
    required_sections = ["choices", "ratings"]
    for section in required_sections:
        if section not in annotation_data:
            return False

    # Check if best answer is selected
    if "best_answer" not in annotation_data.get("choices", {}):
        return False

    # Check if at least some rankings are provided
    choices = annotation_data.get("choices", {})
    rank_keys = ["answer1_rank", "answer2_rank", "answer3_rank", "answer4_rank"]
    if not any(key in choices for key in rank_keys):
        return False

    # Check if at least some ratings are provided
    ratings = annotation_data.get("ratings", {})
    rating_keys = [
        "answer1_rating",
        "answer2_rating",
        "answer3_rating",
        "answer4_rating",
    ]
    if not any(key in ratings for key in rating_keys):
        return False

    # Validate ranking consistency (each rank should be unique)
    provided_ranks = []
    for rank_key in rank_keys:
        if rank_key in choices:
            rank_choices = choices[rank_key]["choices"]
            if rank_choices:
                rank = int(rank_choices[0])
                if rank in provided_ranks:
                    return False  # Duplicate rank
                provided_ranks.append(rank)

    return True
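
A short sketch of the validation rules in action, using the same assumed payload shape as above (hypothetical data, not from the source): both "choices" and "ratings" sections must exist, a best answer must be selected, at least one rank and one rating must be provided, and ranks must not repeat.

from rm_gallery.gallery.data.annotation.rewardbench2 import (
    RewardBench2AnnotationTemplate,
)

template = RewardBench2AnnotationTemplate(name="rewardbench2")

valid_data = {
    "choices": {
        "best_answer": {"choices": ["answer_1"]},
        "answer1_rank": {"choices": ["1"]},
        "answer2_rank": {"choices": ["2"]},
    },
    "ratings": {"answer1_rating": {"rating": 5}},
}
assert template.validate_annotation_data(valid_data)

# Duplicate ranks are rejected: two answers cannot both be ranked "1".
invalid_data = {
    "choices": {
        "best_answer": {"choices": ["answer_1"]},
        "answer1_rank": {"choices": ["1"]},
        "answer2_rank": {"choices": ["1"]},
    },
    "ratings": {"answer1_rating": {"rating": 5}},
}
assert not template.validate_annotation_data(invalid_data)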