Source code for data_juicer.ops.mapper.extract_nickname_mapper

import re
from typing import Dict, Optional

from loguru import logger
from pydantic import PositiveInt

from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.model_utils import get_model, prepare_model

OP_NAME = 'extract_nickname_mapper'


# TODO: LLM-based inference.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class ExtractNicknameMapper(Mapper):
    """
    Extract nickname relationship in the text.

    The sample text is sent to an API model together with a system prompt
    that asks for "speaker / addressee / nickname" triples in a fixed
    markdown layout. The reply is parsed with a regular expression and the
    resulting relations are stored in the sample under ``nickname_key``.
    """

    # Instructions given to the model, including the exact output layout
    # that DEFAULT_OUTPUT_PATTERN is written to parse.
    DEFAULT_SYSTEM_PROMPT = ('给定你一段文本,你的任务是将人物之间的称呼方式(昵称)提取出来。\n'
                             '要求:\n'
                             '- 需要给出说话人对被称呼人的称呼,不要搞反了。\n'
                             '- 相同的说话人和被称呼人最多给出一个最常用的称呼。\n'
                             '- 请不要输出互相没有昵称的称呼方式。\n'
                             '- 输出格式如下:\n'
                             '```\n'
                             '### 称呼方式1\n'
                             '- **说话人**:...\n'
                             '- **被称呼人**:...\n'
                             '- **...对...的昵称**:...\n'
                             '### 称呼方式2\n'
                             '- **说话人**:...\n'
                             '- **被称呼人**:...\n'
                             '- **...对...的昵称**:...\n'
                             '### 称呼方式3\n'
                             '- **说话人**:...\n'
                             '- **被称呼人**:...\n'
                             '- **...对...的昵称**:...\n'
                             '...\n'
                             '```\n')

    # User-message template; ``{text}`` is replaced by the sample text.
    DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'

    # Compiled with re.VERBOSE | re.DOTALL, so layout whitespace and the
    # trailing ``# ...`` comment inside the string are ignored by the regex
    # engine. Six groups are captured per entry: index, speaker, addressee,
    # speaker (repeated), addressee (repeated), nickname.
    DEFAULT_OUTPUT_PATTERN = r"""
        \#\#\#\s*称呼方式(\d+)\s*
        -\s*\*\*说话人\*\*\s*:\s*(.*?)\s*
        -\s*\*\*被称呼人\*\*\s*:\s*(.*?)\s*
        -\s*\*\*(.*?)对(.*?)的昵称\*\*\s*:\s*(.*?)(?=\#\#\#|\Z) # for double check
    """

    def __init__(self,
                 api_model: str = 'gpt-4o',
                 *,
                 nickname_key: str = Fields.nickname,
                 api_endpoint: Optional[str] = None,
                 response_path: Optional[str] = None,
                 system_prompt: Optional[str] = None,
                 input_template: Optional[str] = None,
                 output_pattern: Optional[str] = None,
                 try_num: PositiveInt = 3,
                 drop_text: bool = False,
                 model_params: Dict = {},
                 sampling_params: Dict = {},
                 **kwargs):
        """
        Initialization method.

        :param api_model: API model name.
        :param nickname_key: The field name to store the nickname
            relationship. It's "__dj__nickname__" in default.
        :param api_endpoint: URL endpoint for the API.
        :param response_path: Path to extract content from the API response.
            Defaults to 'choices.0.message.content'.
        :param system_prompt: System prompt for the task.
        :param input_template: Template for building the model input. It is
            formatted with a single ``text`` field.
        :param output_pattern: Regular expression for parsing model output.
            It is compiled with ``re.VERBOSE | re.DOTALL`` and must capture
            exactly six groups per match (see ``parse_output``).
        :param try_num: The number of retry attempts when there is an API
            call error or output parsing error.
        :param drop_text: If drop the text in the output.
        :param model_params: Parameters for initializing the API model.
        :param sampling_params: Extra parameters passed to the API call.
            e.g {'temperature': 0.9, 'top_p': 0.95}
        :param kwargs: Extra keyword arguments.
        """
        super().__init__(**kwargs)

        self.nickname_key = nickname_key
        self.try_num = try_num
        self.drop_text = drop_text

        # Empty/None overrides fall back to the class-level defaults.
        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN

        # The `{}` defaults in the signature are single objects shared by
        # every call, so take private copies instead of keeping a reference
        # to them (or to a dict the caller may mutate later). `or {}` also
        # tolerates an explicit None.
        model_params = dict(model_params or {})
        self.sampling_params = dict(sampling_params or {})

        self.model_key = prepare_model(model_type='api',
                                       model=api_model,
                                       endpoint=api_endpoint,
                                       response_path=response_path,
                                       **model_params)

    def parse_output(self, raw_output):
        """
        Parse the raw model reply into a list of nickname relations.

        Every regex match must yield six groups: entry index, speaker,
        addressee, speaker as repeated in the nickname line, addressee as
        repeated in the nickname line, and the nickname itself. An entry is
        discarded when the repeated names disagree with the declared ones,
        when the "nickname" equals the addressee's name, or when any of the
        three fields is empty after stripping.

        :param raw_output: Text returned by the model.
        :return: A list of dicts keyed by ``Fields.source_entity``,
            ``Fields.target_entity``, ``Fields.relation_description``,
            ``Fields.relation_keywords`` and ``Fields.relation_strength``,
            de-duplicated and kept in order of first appearance.
        """
        pattern = re.compile(self.output_pattern, re.VERBOSE | re.DOTALL)

        # A dict serves as an insertion-ordered set: duplicates collapse
        # while the first-seen order is kept, so the result is reproducible
        # across runs (a plain set of string tuples is not, because of hash
        # randomisation).
        unique_relations = {}
        for (_, role1, role2, role1_tmp, role2_tmp,
             nickname) in pattern.findall(raw_output):
            role1 = role1.strip()
            role2 = role2.strip()
            nickname = nickname.strip()

            # for double check: names repeated in the nickname line must
            # agree with the declared speaker / addressee
            if role1 != role1_tmp.strip() or role2 != role2_tmp.strip():
                continue

            # is name but not nickname
            if role2 == nickname:
                continue

            if role1 and role2 and nickname:
                unique_relations[(role1, role2, nickname)] = None

        return [{
            Fields.source_entity: speaker,
            Fields.target_entity: addressee,
            Fields.relation_description: nickname,
            Fields.relation_keywords: ['nickname'],
            Fields.relation_strength: None
        } for speaker, addressee, nickname in unique_relations]

    def process_single(self, sample, rank=None):
        """
        Query the model for one sample and store the extracted relations.

        The model is called up to ``try_num`` times; the loop stops at the
        first reply that parses into at least one relation. Exceptions
        raised by the call or by parsing are logged as warnings and count
        as a failed attempt. If every attempt fails, the result of the last
        successful parse (normally an empty list) is stored.

        :param sample: The sample to process; its text is read from
            ``self.text_key``.
        :param rank: Passed through to ``get_model``.
        :return: The same sample with ``self.nickname_key`` set, and with
            the text field removed when ``drop_text`` is enabled.
        """
        client = get_model(self.model_key, rank=rank)

        input_prompt = self.input_template.format(text=sample[self.text_key])
        messages = [{
            'role': 'system',
            'content': self.system_prompt
        }, {
            'role': 'user',
            'content': input_prompt
        }]

        nickname_relations = []
        for _ in range(self.try_num):
            # Deliberately broad: any API or parsing failure just consumes
            # one retry instead of aborting the whole sample.
            try:
                output = client(messages, **self.sampling_params)
                nickname_relations = self.parse_output(output)
                if nickname_relations:
                    break
            except Exception as e:
                logger.warning(f'Exception: {e}')

        sample[self.nickname_key] = nickname_relations
        if self.drop_text:
            sample.pop(self.text_key)

        return sample