Source code for data_juicer.ops.mapper.query_intent_detection_mapper
from typing import Dict, Optional
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.model_utils import get_model, prepare_model
from ..base_op import OPERATORS, TAGGING_OPS, Mapper
# Name passed to ``register_module`` when the operator class below is
# registered in the OPERATORS and TAGGING_OPS registries.
OP_NAME = "query_intent_detection_mapper"
[docs]
@TAGGING_OPS.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class QueryIntentDetectionMapper(Mapper):
    """Predict an intent label and a confidence score for each query.

    A Hugging Face ``text-classification`` pipeline classifies the query
    text. When a Chinese-to-English translation model is configured, the
    queries are first run through a Hugging Face ``translation`` pipeline
    and the translated text is classified instead.

    The predicted label and score are written into each sample's meta dict
    under ``label_key`` and ``score_key`` (by default
    ``MetaKeys.query_intent_label`` and ``MetaKeys.query_intent_score``).
    If the first sample of a batch already carries both keys, the whole
    batch is returned unchanged.
    """

    # NOTE(review): both flags are consumed by the base operator machinery,
    # which is not visible here -- presumably they request a CUDA device
    # and batched invocation of ``process_batched``; confirm in ``Mapper``.
    _accelerator = "cuda"
    _batched_op = True

    def __init__(
        self,
        hf_model: str = "bespin-global/klue-roberta-small-3i4k-intent-classification",  # noqa: E501 E131
        zh_to_en_hf_model: Optional[str] = "Helsinki-NLP/opus-mt-zh-en",
        model_params: Dict = {},
        zh_to_en_model_params: Dict = {},
        *,
        label_key: str = MetaKeys.query_intent_label,
        score_key: str = MetaKeys.query_intent_score,
        **kwargs,
    ):
        """
        Initialization method.

        :param hf_model: Huggingface model ID to predict intent label.
        :param zh_to_en_hf_model: Translation model from Chinese to English.
            If not None, translate the query from Chinese to English
            before classification.
        :param model_params: Extra keyword arguments forwarded to
            ``prepare_model`` for ``hf_model``. ``None`` is treated as an
            empty dict.
        :param zh_to_en_model_params: Extra keyword arguments forwarded to
            ``prepare_model`` for ``zh_to_en_hf_model``. ``None`` is
            treated as an empty dict.
        :param label_key: The key name in the meta field to store the
            output label. Defaults to ``MetaKeys.query_intent_label``.
        :param score_key: The key name in the meta field to store the
            corresponding label score. Defaults to
            ``MetaKeys.query_intent_score``.
        :param kwargs: Extra keyword arguments.
        """
        super().__init__(**kwargs)
        self.label_key = label_key
        self.score_key = score_key

        # The ``{}` defaults are kept for interface compatibility; they are
        # only unpacked below and never mutated, so sharing them is harmless.
        self.model_key = prepare_model(
            model_type="huggingface",
            pretrained_model_name_or_path=hf_model,
            return_pipe=True,
            pipe_task="text-classification",
            **(model_params or {}),
        )

        # ``None`` disables the translation step in ``process_batched``.
        self.zh_to_en_model_key = None
        if zh_to_en_hf_model is not None:
            self.zh_to_en_model_key = prepare_model(
                model_type="huggingface",
                pretrained_model_name_or_path=zh_to_en_hf_model,
                return_pipe=True,
                pipe_task="translation",
                **(zh_to_en_model_params or {}),
            )

    def process_batched(self, samples, rank=None):
        """Annotate every sample in the batch with its intent label and score.

        :param samples: Batch in column layout; ``samples[Fields.meta]`` is
            indexed as a sequence of per-sample meta dicts and
            ``samples[self.query_key]`` is passed to the pipelines as the
            query texts. (``query_key`` is not set in this class --
            presumably provided by the base operator; verify.)
        :param rank: Forwarded to ``get_model`` when fetching the pipelines.
        :return: The same ``samples`` object, with the meta dicts updated
            in place.
        """
        metas = samples[Fields.meta]
        # An empty batch has nothing to tag (and no first sample to probe).
        if not metas:
            return samples

        # Only the first sample is inspected: if it already has both keys,
        # the whole batch is treated as processed and returned unchanged.
        first_meta = metas[0]
        if self.label_key in first_meta and self.score_key in first_meta:
            return samples

        queries = samples[self.query_key]
        if self.zh_to_en_model_key is not None:
            translator, _ = get_model(self.zh_to_en_model_key, rank, self.use_cuda())
            queries = [item["translation_text"] for item in translator(queries)]

        classifier, _ = get_model(self.model_key, rank, self.use_cuda())
        for meta, prediction in zip(metas, classifier(queries)):
            meta[self.label_key] = prediction["label"]
            meta[self.score_key] = prediction["score"]
        return samples