@OPERATORS.register_module(OP_NAME)
class FixUnicodeMapper(Mapper):
    """Fixes unicode errors in text samples.

    This operator corrects common unicode errors and normalizes the text to a
    specified Unicode normalization form. The default normalization form is
    'NFC', but it can be set to 'NFKC', 'NFD', or 'NFKD' during
    initialization. It processes text samples in batches, applying the
    specified normalization to each sample. If an unsupported normalization
    form is provided, a ValueError is raised.
    """

    # NOTE(review): consumed by the Mapper base class (not visible here);
    # presumably switches the operator to batched processing -- confirm.
    _batched_op = True

    def __init__(self, normalization: str = None, *args, **kwargs):
        """
        Initialization method.

        :param normalization: the specified form of Unicode
            normalization mode, which can be one of
            ['NFC', 'NFKC', 'NFD', and 'NFKD'], default 'NFC'.
            The value is matched case-insensitively; ``None`` or an
            empty string selects the default.
        :param args: extra args
        :param kwargs: extra args
        :raises ValueError: if ``normalization`` is a non-empty string
            that is not one of the supported forms.
        """
        super().__init__(*args, **kwargs)

        # A falsy value (None or "") falls back to the default form; any
        # other value is upper-cased exactly once so the stored attribute
        # is already canonical for the membership check below.
        self.normalization = normalization.upper() if normalization else "NFC"

        if self.normalization not in ("NFC", "NFKC", "NFD", "NFKD"):
            # Report the caller's original spelling, not the upper-cased one.
            raise ValueError(
                f"Normalization mode [{normalization}] is not "
                "supported. Can only be one of "
                '["NFC", "NFKC", "NFD", "NFKD"]'
            )