data_juicer.ops.mapper.chinese_convert_mapper 源代码

from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper

opencc = LazyLoader("opencc")

OP_NAME = "chinese_convert_mapper"

OPENCC_CONVERTER = None


[文档] def prepare_converter(mode): mode_path = mode + ".json" global OPENCC_CONVERTER if OPENCC_CONVERTER is None: # empty converter OPENCC_CONVERTER = opencc.OpenCC(mode_path) if not OPENCC_CONVERTER.config.endswith(mode_path): # the config is actually a config path # update and get a new converter with specified mode OPENCC_CONVERTER = opencc.OpenCC(mode_path)
[文档] @OPERATORS.register_module(OP_NAME) class ChineseConvertMapper(Mapper): """Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. This operator converts Chinese text based on the specified mode. It supports conversions between Simplified Chinese, Traditional Chinese (including Taiwan and Hong Kong variants), and Japanese Kanji. The conversion is performed using a pre-defined set of rules. The available modes include 's2t' for Simplified to Traditional, 't2s' for Traditional to Simplified, and other specific variants like 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', and 'jp2t'. The operator processes text in batches and applies the conversion to the specified text key in the samples.""" _batched_op = True
[文档] def __init__(self, mode: str = "s2t", *args, **kwargs): """ Initialization method. :param mode: Choose the mode to convert Chinese: s2t: Simplified Chinese to Traditional Chinese, t2s: Traditional Chinese to Simplified Chinese, s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard), tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese, s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant), hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese, s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) with Taiwanese idiom, tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese with Mainland Chinese idiom, t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard), tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese, hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese, t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant), t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji, jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters, :param args: extra args :param kwargs: extra args """ super().__init__(*args, **kwargs) mode_list = [ "s2t", "t2s", "s2tw", "tw2s", "s2hk", "hk2s", "s2twp", "tw2sp", "t2tw", "tw2t", "hk2t", "t2hk", "t2jp", "jp2t", ] assert mode in mode_list, "Please make sure mode is one of {}".format(mode_list) self.mode = mode prepare_converter(self.mode)
[文档] def process_batched(self, samples): prepare_converter(self.mode) samples[self.text_key] = [OPENCC_CONVERTER.convert(text) for text in samples[self.text_key]] return samples