Source code for data_juicer.ops.mapper.query_intent_detection_mapper

from typing import Dict, Optional

from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, TAGGING_OPS, Mapper

OP_NAME = "query_intent_detection_mapper"


[docs] @TAGGING_OPS.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class QueryIntentDetectionMapper(Mapper): """ Mapper to predict user's Intent label in query. Input from query_key. Output intent label and corresponding score for the query. """ _accelerator = "cuda" _batched_op = True
[docs] def __init__( self, hf_model: str = "bespin-global/klue-roberta-small-3i4k-intent-classification", # noqa: E501 E131 zh_to_en_hf_model: Optional[str] = "Helsinki-NLP/opus-mt-zh-en", model_params: Dict = {}, zh_to_en_model_params: Dict = {}, *, label_key: str = MetaKeys.query_intent_label, score_key: str = MetaKeys.query_intent_score, **kwargs, ): """ Initialization method. :param hf_model: Huggingface model ID to predict intent label. :param zh_to_en_hf_model: Translation model from Chinese to English. If not None, translate the query from Chinese to English. :param model_params: model param for hf_model. :param zh_to_en_model_params: model param for zh_to_hf_model. :param label_key: The key name in the meta field to store the output label. It is 'query_intent_label' in default. :param score_key: The key name in the meta field to store the corresponding label score. It is 'query_intent_label_score' in default. :param kwargs: Extra keyword arguments. """ super().__init__(**kwargs) self.label_key = label_key self.score_key = score_key self.model_key = prepare_model( model_type="huggingface", pretrained_model_name_or_path=hf_model, return_pipe=True, pipe_task="text-classification", **model_params, ) if zh_to_en_hf_model is not None: self.zh_to_en_model_key = prepare_model( model_type="huggingface", pretrained_model_name_or_path=zh_to_en_hf_model, return_pipe=True, pipe_task="translation", **zh_to_en_model_params, ) else: self.zh_to_en_model_key = None
[docs] def process_batched(self, samples, rank=None): metas = samples[Fields.meta] if self.label_key in metas[0] and self.score_key in metas[0]: return samples queries = samples[self.query_key] if self.zh_to_en_model_key is not None: translator, _ = get_model(self.zh_to_en_model_key, rank, self.use_cuda()) results = translator(queries) queries = [item["translation_text"] for item in results] classifier, _ = get_model(self.model_key, rank, self.use_cuda()) results = classifier(queries) labels = [r["label"] for r in results] scores = [r["score"] for r in results] for i in range(len(metas)): metas[i][self.label_key] = labels[i] metas[i][self.score_key] = scores[i] return samples