Source code for data_juicer.ops.mapper.pair_preference_mapper
importrefromtypingimportDict,OptionalfromloguruimportloggerfrompydanticimportPositiveIntfromdata_juicer.ops.base_opimportOPERATORS,Mapperfromdata_juicer.utils.model_utilsimportget_model,prepare_modelOP_NAME="pair_preference_mapper"# TODO: Extend LLM-based OPs into API-based implementation.
@OPERATORS.register_module(OP_NAME)
class PairPreferenceMapper(Mapper):
    """Mapper to construct paired preference samples.

    The operator prompts an API-served chat model to rewrite the response of
    a question/answer pair so that it contrasts with the original response.
    The rewritten ("rejected") response and the model's stated reason are
    meant to be stored under ``rejected_key`` and ``reason_key``.
    """

    # The prompts are assembled from adjacent string literals (rather than a
    # triple-quoted block) to avoid leading whitespace in the final text.

    # System prompt (Chinese). Gist: "Based on the reference information,
    # modify the response of the Q&A pair so that it is opposite to the
    # original in any one of language style, factuality, persona, stance,
    # etc. Output strictly in the marked format below and nothing else:
    # 【回答】 <new response> 【原因】 <reason for the new response>".
    DEFAULT_SYSTEM_PROMPT = (
        "你的任务是根据参考信息修改问答对中的回答,在语言风格、事实性、人物身份、立场等任一方面与原回答相反。"
        "必须按照以下标记格式输出,不要输出其他多余内容。\n"
        "【回答】\n"
        "生成的新回答\n"
        "【原因】\n"
        "生成该回答的原因"
    )

    # User-message template (Chinese). Sections: 【参考信息】 reference
    # information, then "the original Q&A pair" with 【问题】 question and
    # 【回答】 response.
    DEFAULT_INPUT_TEMPLATE = (
        "【参考信息】\n"
        "{reference}\n"
        "\n"
        "以下是原始问答对:\n"
        "【问题】\n"
        "{query}\n"
        "【回答】\n"
        "{response}"
    )

    # Group 1 captures the text after the 【回答】 (response) marker, group 2
    # the text after the 【原因】 (reason) marker.
    DEFAULT_OUTPUT_PATTERN = r".*?【回答】\s*(.*?)\s*【原因】\s*(.*)"

    def __init__(
        self,
        api_model: str = "gpt-4o",
        *,
        api_endpoint: Optional[str] = None,
        response_path: Optional[str] = None,
        system_prompt: Optional[str] = None,
        input_template: Optional[str] = None,
        output_pattern: Optional[str] = None,
        rejected_key: str = "rejected_response",
        reason_key: str = "reason",
        try_num: PositiveInt = 3,
        model_params: Dict = {},
        sampling_params: Dict = {},
        **kwargs,
    ):
        """
        Initialization method.

        :param api_model: API model name.
        :param api_endpoint: URL endpoint for the API.
        :param response_path: Path to extract content from the API response.
            Defaults to 'choices.0.message.content'.
        :param system_prompt: System prompt for guiding the generation task.
            Falls back to ``DEFAULT_SYSTEM_PROMPT`` when empty or None.
        :param input_template: Template for building the model input. It must
            contain placeholders '{query}' and '{response}', and can
            optionally include '{reference}'. Falls back to
            ``DEFAULT_INPUT_TEMPLATE`` when empty or None.
        :param output_pattern: Regular expression for parsing model output.
            Falls back to ``DEFAULT_OUTPUT_PATTERN`` when empty or None.
        :param rejected_key: The field name in the sample to store the
            generated rejected response. Defaults to 'rejected_response'.
        :param reason_key: The field name in the sample to store the reason
            for generating the response. Defaults to 'reason'.
        :param try_num: The number of retries for the API call in case of
            response parsing failure. Defaults to 3.
        :param model_params: Parameters for initializing the API model.
            They are forwarded as keyword arguments to ``prepare_model``.
            None is treated as empty.
        :param sampling_params: Extra parameters passed to the API call.
            e.g {'temperature': 0.9, 'top_p': 0.95}. A shallow copy is
            stored on the instance; None is treated as empty.
        :param kwargs: Extra keyword arguments, forwarded to the base class.
        """
        super().__init__(**kwargs)

        # A falsy override (None or "") selects the class-level default.
        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN

        self.rejected_key = rejected_key
        self.reason_key = reason_key

        self.model_key = prepare_model(
            model_type="api",
            model=api_model,
            endpoint=api_endpoint,
            response_path=response_path,
            **(model_params or {}),
        )

        self.try_num = try_num
        # The signature keeps its `{}` defaults for interface compatibility,
        # but that default dict is a single object shared by every call.
        # Store a private copy so mutating one instance's sampling_params
        # cannot leak into other instances or into the caller's dict.
        self.sampling_params = dict(sampling_params or {})