Source code for data_juicer.ops.mapper.chinese_convert_mapper

from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper

opencc = LazyLoader('opencc', 'opencc')

OP_NAME = 'chinese_convert_mapper'

OPENCC_CONVERTER = None


def prepare_converter(mode):
    """
    Make sure the module-level ``OPENCC_CONVERTER`` is built for ``mode``.

    The converter is shared through a module global. It is created on
    first use and replaced whenever the ``config`` of the current converter
    does not end with the config file name derived from ``mode``.

    :param mode: conversion mode name; ``'.json'`` is appended to it to
        form the OpenCC config file name.
    """
    global OPENCC_CONVERTER
    config_file = mode + '.json'

    # No converter has been created yet: build the first one.
    if OPENCC_CONVERTER is None:
        OPENCC_CONVERTER = opencc.OpenCC(config_file)

    # The converter's ``config`` is a config path (per the original note),
    # so compare by suffix to tell whether it already serves this mode.
    if OPENCC_CONVERTER.config.endswith(config_file):
        return

    # The shared converter serves another mode: swap in a new one.
    OPENCC_CONVERTER = opencc.OpenCC(config_file)
@OPERATORS.register_module(OP_NAME)
class ChineseConvertMapper(Mapper):
    """Mapper that converts Chinese text between Traditional Chinese,
    Simplified Chinese and Japanese Kanji."""

    _batched_op = True

    def __init__(self, mode: str = 's2t', *args, **kwargs):
        """
        Initialization method.

        :param mode: The conversion mode to apply. Supported values:

            - ``s2t``: Simplified Chinese to Traditional Chinese
            - ``t2s``: Traditional Chinese to Simplified Chinese
            - ``s2tw``: Simplified Chinese to Traditional Chinese
              (Taiwan Standard)
            - ``tw2s``: Traditional Chinese (Taiwan Standard) to
              Simplified Chinese
            - ``s2hk``: Simplified Chinese to Traditional Chinese
              (Hong Kong variant)
            - ``hk2s``: Traditional Chinese (Hong Kong variant) to
              Simplified Chinese
            - ``s2twp``: Simplified Chinese to Traditional Chinese
              (Taiwan Standard) with Taiwanese idiom
            - ``tw2sp``: Traditional Chinese (Taiwan Standard) to
              Simplified Chinese with Mainland Chinese idiom
            - ``t2tw``: Traditional Chinese to Traditional Chinese
              (Taiwan Standard)
            - ``tw2t``: Traditional Chinese (Taiwan Standard) to
              Traditional Chinese
            - ``hk2t``: Traditional Chinese (Hong Kong variant) to
              Traditional Chinese
            - ``t2hk``: Traditional Chinese to Traditional Chinese
              (Hong Kong variant)
            - ``t2jp``: Traditional Chinese Characters (Kyūjitai) to
              New Japanese Kanji
            - ``jp2t``: New Japanese Kanji (Shinjitai) to Traditional
              Chinese Characters
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        supported_modes = [
            's2t', 't2s',
            's2tw', 'tw2s',
            's2hk', 'hk2s',
            's2twp', 'tw2sp',
            't2tw', 'tw2t',
            'hk2t', 't2hk',
            't2jp', 'jp2t',
        ]
        assert mode in supported_modes, \
            'Please make sure mode is one of {}'.format(supported_modes)

        self.mode = mode
        # Build (or switch) the shared converter up front for this mode.
        prepare_converter(self.mode)

    def process_batched(self, samples):
        """
        Convert every text of the batch with the configured mode.

        :param samples: batch of samples; the values stored under
            ``self.text_key`` are iterated and converted one by one.
        :return: the same ``samples`` object, with the ``self.text_key``
            entry replaced by the list of converted texts.
        """
        # The converter is a module global, so re-check that it matches
        # this instance's mode before using it.
        prepare_converter(self.mode)

        field = self.text_key
        converted = []
        for raw_text in samples[field]:
            converted.append(OPENCC_CONVERTER.convert(raw_text))
        samples[field] = converted
        return samples