Source code for data_juicer.ops.mapper.query_sentiment_detection_mapper

from typing import Dict, Optional

from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, TAGGING_OPS, Mapper

OP_NAME = 'query_sentiment_detection_mapper'


[docs] @TAGGING_OPS.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class QuerySentimentDetectionMapper(Mapper): """ Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query. Input from query_key. Output label and corresponding score for the query, which is store in 'query_sentiment_label' and 'query_sentiment_label_score' in Data-Juicer meta field. """ _accelerator = 'cuda' _batched_op = True
[docs] def __init__( self, hf_model: str = 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis', # noqa: E501 E131 zh_to_en_hf_model: Optional[str] = 'Helsinki-NLP/opus-mt-zh-en', model_params: Dict = {}, zh_to_en_model_params: Dict = {}, *, label_key: str = MetaKeys.query_sentiment_label, score_key: str = MetaKeys.query_sentiment_score, **kwargs): """ Initialization method. :param hf_model: Hugginface model ID to predict sentiment label. :param zh_to_en_hf_model: Translation model from Chinese to English. If not None, translate the query from Chinese to English. :param model_params: model param for hf_model. :param zh_to_en_model_params: model param for zh_to_hf_model. :param label_key: The key name in the meta field to store the output label. It is 'query_sentiment_label' in default. :param score_key: The key name in the meta field to store the corresponding label score. It is 'query_sentiment_label_score' in default. :param kwargs: Extra keyword arguments. """ super().__init__(**kwargs) self.label_key = label_key self.score_key = score_key self.model_key = prepare_model(model_type='huggingface', pretrained_model_name_or_path=hf_model, return_pipe=True, pipe_task='text-classification', **model_params) if zh_to_en_hf_model is not None: self.zh_to_en_model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=zh_to_en_hf_model, return_pipe=True, pipe_task='translation', **zh_to_en_model_params) else: self.zh_to_en_model_key = None
[docs] def process_batched(self, samples, rank=None): metas = samples[Fields.meta] if self.label_key in metas[0] and self.score_key in metas[0]: return samples queries = samples[self.query_key] if self.zh_to_en_model_key is not None: translater, _ = get_model(self.zh_to_en_model_key, rank, self.use_cuda()) results = translater(queries) queries = [item['translation_text'] for item in results] classifier, _ = get_model(self.model_key, rank, self.use_cuda()) results = classifier(queries) labels = [r['label'] for r in results] scores = [r['score'] for r in results] for i in range(len(metas)): metas[i][self.label_key] = labels[i] metas[i][self.score_key] = scores[i] return samples