Skip to content

helpsteer2_pointwise

HelpSteer2PointwiseConverter

Bases: DataConverter

Unified converter for the HelpSteer2 data format. Can handle data from both local files and the HuggingFace Hub.

Source code in rm_gallery/gallery/data/load/helpsteer2_pointwise.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
@DataConverterRegistry.register("helpsteer2_pointwise")
class HelpSteer2PointwiseConverter(DataConverter):
    """
    Converter that turns HelpSteer2 records into DataSample objects.
    Accepts records loaded from local files as well as from the HuggingFace Hub.
    """

    def convert_to_data_sample(
        self, data_dict: Dict[str, Any], source_info: Dict[str, Any]
    ) -> Union[DataSample, List[DataSample]]:
        """Build a DataSample from a single HelpSteer2 record.

        Args:
            data_dict: One raw record. The "prompt" and "response" keys are
                required; the five score keys ("helpfulness", "correctness",
                "coherence", "complexity", "verbosity") are optional and
                default to None when absent.
            source_info: Description of where the record came from. Its
                "load_type" entry ("local" or "huggingface") selects which
                extra fields are copied into the sample metadata; any other
                value adds nothing.

        Returns:
            A one-element list holding the new DataSample, or None when any
            exception is raised while building it (the error is logged).
        """
        # The id is the MD5 hex digest of the record's string form; it is
        # computed outside the try block, so failures here propagate.
        record_text = str(data_dict)
        sample_id = hashlib.md5(record_text.encode()).hexdigest()

        try:
            # The prompt becomes a single user message.
            prompt_messages = [
                ChatMessage(role="user", content=data_dict["prompt"])
            ]

            # Per-attribute scores attached to the answer as its label.
            score_names = (
                "helpfulness",
                "correctness",
                "coherence",
                "complexity",
                "verbosity",
            )
            scores = {name: data_dict.get(name) for name in score_names}

            # The response becomes the single assistant answer.
            answer = Step(
                role="assistant", content=data_dict["response"], label=scores
            )
            outputs = [DataOutput(answer=answer)]

            # Source-specific metadata; stays empty for unknown load types.
            load_type = source_info.get("load_type")
            source_fields = {}
            if load_type == "local":
                source_fields = {
                    "source_file_path": source_info.get("source_file_path"),
                    "load_type": "local",
                }
            elif load_type == "huggingface":
                source_fields = {
                    "dataset_name": source_info.get(
                        "dataset_name", "nvidia/HelpSteer2"
                    ),
                    "dataset_config": source_info.get("dataset_config"),
                    "split": source_info.get("split", "train"),
                    "load_type": "huggingface",
                }

            sample_metadata = {
                "raw_data": data_dict,
                "load_strategy": "HelpSteer2Converter",
                **source_fields,
            }

            return [
                DataSample(
                    unique_id=sample_id,
                    input=prompt_messages,
                    output=outputs,
                    source="helpsteer2",
                    task_category="chat",
                    metadata=sample_metadata,
                )
            ]

        except Exception as err:
            # Best-effort conversion: report the failure and signal it with None.
            logger.error(f"Error creating HelpSteer2 DataSample: {str(err)}")
            return None

convert_to_data_sample(data_dict, source_info)

Convert HelpSteer2 data to DataSample format

Source code in rm_gallery/gallery/data/load/helpsteer2_pointwise.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def convert_to_data_sample(
    self, data_dict: Dict[str, Any], source_info: Dict[str, Any]
) -> Union[DataSample, List[DataSample]]:
    """Build a DataSample from a single HelpSteer2 record.

    Args:
        data_dict: One raw record. The "prompt" and "response" keys are
            required; the five score keys ("helpfulness", "correctness",
            "coherence", "complexity", "verbosity") are optional and
            default to None when absent.
        source_info: Description of where the record came from. Its
            "load_type" entry ("local" or "huggingface") selects which
            extra fields are copied into the sample metadata; any other
            value adds nothing.

    Returns:
        A one-element list holding the new DataSample, or None when any
        exception is raised while building it (the error is logged).
    """
    # The id is the MD5 hex digest of the record's string form; it is
    # computed outside the try block, so failures here propagate.
    record_text = str(data_dict)
    sample_id = hashlib.md5(record_text.encode()).hexdigest()

    try:
        # The prompt becomes a single user message.
        prompt_messages = [
            ChatMessage(role="user", content=data_dict["prompt"])
        ]

        # Per-attribute scores attached to the answer as its label.
        score_names = (
            "helpfulness",
            "correctness",
            "coherence",
            "complexity",
            "verbosity",
        )
        scores = {name: data_dict.get(name) for name in score_names}

        # The response becomes the single assistant answer.
        answer = Step(
            role="assistant", content=data_dict["response"], label=scores
        )
        outputs = [DataOutput(answer=answer)]

        # Source-specific metadata; stays empty for unknown load types.
        load_type = source_info.get("load_type")
        source_fields = {}
        if load_type == "local":
            source_fields = {
                "source_file_path": source_info.get("source_file_path"),
                "load_type": "local",
            }
        elif load_type == "huggingface":
            source_fields = {
                "dataset_name": source_info.get(
                    "dataset_name", "nvidia/HelpSteer2"
                ),
                "dataset_config": source_info.get("dataset_config"),
                "split": source_info.get("split", "train"),
                "load_type": "huggingface",
            }

        sample_metadata = {
            "raw_data": data_dict,
            "load_strategy": "HelpSteer2Converter",
            **source_fields,
        }

        return [
            DataSample(
                unique_id=sample_id,
                input=prompt_messages,
                output=outputs,
                source="helpsteer2",
                task_category="chat",
                metadata=sample_metadata,
            )
        ]

    except Exception as err:
        # Best-effort conversion: report the failure and signal it with None.
        logger.error(f"Error creating HelpSteer2 DataSample: {str(err)}")
        return None