Source code for data_juicer.ops.common.helper_func

# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------
from typing import Dict

import regex as re


class UnionFind:

    def __init__(self):
        """Initialization method."""
        self.parent: Dict[int, int] = {}

    def find(self, x):
        """Find the root of the set containing x, with path compression."""
        if x not in self.parent:
            self.parent[x] = x
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        """Merge the sets containing x and y, keeping the smaller root."""
        px = self.find(x)
        py = self.find(y)
        self.parent[px] = self.parent[py] = min(px, py)
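
# A quick sketch of how UnionFind behaves (illustrative only, using just the
# class above):
#   >>> uf = UnionFind()
#   >>> uf.union(1, 2)
#   >>> uf.union(2, 3)
#   >>> uf.find(3)    # 1, 2 and 3 now share the smallest root, 1
#   1
#   >>> uf.find(4)    # unseen elements become their own root
#   4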


def strip(document, strip_characters):
    """
    Way faster than document.strip(strip_characters) since strip_characters
    is now a set instead of a str, and it contains a lot of elements (all
    the emojis).

    :param document: document to be processed
    :param strip_characters: characters used for stripping document
    :return: stripped document
    """
    if not document:
        return document
    beg_ind = 0
    end_ind = len(document)
    for i in range(len(document)):
        if document[i] in strip_characters:
            beg_ind += 1
        else:
            break
    for i in range(1, len(document) + 1):
        if document[-i] in strip_characters:
            end_ind -= 1
        else:
            break
    document_stripped = document[beg_ind:end_ind]
    return document_stripped
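
# Illustrative usage of strip with a set of strip characters (a sketch, not
# part of the module's tests):
#   >>> strip('..hello..', {'.'})
#   'hello'
#   >>> strip('...', {'.'})    # fully stripped documents become ''
#   ''
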
def split_on_whitespace(document, new_line=False, tab=False):
    """
    This method also removes concatenated spaces.

    :param document: document to be split
    :param new_line: whether to split document on '\\n'
    :param tab: whether to split document on '\\t'
    :return: word list obtained after splitting document
    """
    sep = [' '] + new_line * ['\n'] + tab * ['\t']
    sep = '|'.join(sep)
    split_document = re.split(sep, document)
    split_document = [word for word in split_document if word]
    return split_document
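
# Illustrative usage of split_on_whitespace; note how consecutive spaces
# yield no empty tokens:
#   >>> split_on_whitespace('a  b\nc', new_line=True)
#   ['a', 'b', 'c']
#   >>> split_on_whitespace('a  b\nc')    # '\n' is kept when new_line=False
#   ['a', 'b\nc']
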
def split_on_newline_tab_whitespace(document):
    """
    This method is used to split the document into different levels of
    sub-sentences: first split on '\\n', then on '\\t', then on ' '.

    :param document: document to be split
    :return: sentence list obtained after splitting document
    """
    sentences = document.split('\n')
    sentences = [sentence.split('\t') for sentence in sentences]
    sentences = [[
        split_on_whitespace(subsentence) for subsentence in sentence
    ] for sentence in sentences]
    return sentences
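
# Illustrative usage of split_on_newline_tab_whitespace; the result is a
# three-level nesting: lines -> tab-separated fields -> words:
#   >>> split_on_newline_tab_whitespace('a  b\tc\nd')
#   [[['a', 'b'], ['c']], [['d']]]
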
def merge_on_whitespace_tab_newline(sentences):
    """
    This method is used to merge different levels of sub-sentences into one
    document. It inverts split_on_newline_tab_whitespace and removes
    concatenated separators.

    :param sentences: sentence list to be merged
    :return: document obtained after merging sub-sentences
    """
    sentences = [[
        ' '.join(subsentence) for subsentence in sentence if subsentence
    ] for sentence in sentences]
    sentences = ['\t'.join(sentence) for sentence in sentences if sentence]
    if not sentences:
        return ''
    document = '\n'.join(sentences)
    return document
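
# Round-trip sketch: merging the split structure rebuilds the document,
# modulo concatenated separators (the double space collapses to one):
#   >>> merge_on_whitespace_tab_newline(
#   ...     split_on_newline_tab_whitespace('a  b\tc\nd'))
#   'a b\tc\nd'
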
def words_augmentation(words, group_size, join_char):
    """
    Augment words, especially for Chinese (without a space between words)
    and Vietnamese (with a space between syllables).

    :param words: word list to be augmented
    :param group_size: the size of the word groups that need to be merged
    :param join_char: characters to be added between word groups
    :return: word list after augmentation
    """
    augmentation = [
        join_char.join(words[i:i + group_size])
        for i in range(len(words) - group_size + 1)
    ]
    return augmentation
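
# Illustrative usage of words_augmentation, e.g. grouping adjacent Chinese
# characters into 2-grams:
#   >>> words_augmentation(['中', '文', '词'], 2, '')
#   ['中文', '文词']
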
def get_words_from_document(
    document,
    token_func=None,
    new_line=True,
    tab=True,
):
    """
    Get words from a document. Useful to compute ratios, like the
    stopwords ratio.

    :param document: document to split into words
    :param token_func: tokenizer function; if specified, it will be used
        to split the document into tokens
    :param new_line: whether to use '\\n' to split words
    :param tab: whether to use '\\t' to split words
    :return: word list obtained from document
    """
    if token_func:
        words = token_func(document)
    else:
        words = split_on_whitespace(document, new_line, tab)
    return words
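
# Illustrative usage of get_words_from_document, with and without a custom
# tokenizer (str.split here is just a stand-in tokenizer):
#   >>> get_words_from_document('a b\tc')
#   ['a', 'b', 'c']
#   >>> get_words_from_document('a b\tc', token_func=str.split)
#   ['a', 'b', 'c']
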
def words_refinement(words,
                     lower_case=False,
                     strip_chars=None,
                     use_words_aug=False,
                     words_aug_group_sizes=[2],
                     words_aug_join_char=''):
    """
    Refine split words. Not reversible, since the document is split on
    multiple characters, words are stripped of special characters, and
    characters are converted to lowercase.

    :param words: the word list to be refined
    :param lower_case: whether to convert words to lowercase
    :param strip_chars: chars that need to be stripped from words
    :param use_words_aug: whether to use word augmentation
    :param words_aug_group_sizes: the sizes of the word groups that need
        to be merged
    :param words_aug_join_char: characters to be added between word groups
    :return: refined word list
    """
    if lower_case:
        words = [word.lower() for word in words]
    if strip_chars:
        words = [strip(word, strip_chars) for word in words]
        words = [word for word in words if word]
    if use_words_aug:
        augmentation = [
            words_augmentation(words, group_size, words_aug_join_char)
            for group_size in words_aug_group_sizes
        ]
        augmentation = [word for augm in augmentation for word in augm]
        words = words + augmentation
    return words
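
# Illustrative usage of words_refinement, combining lowercasing with 2-gram
# augmentation:
#   >>> words_refinement(['Hello', 'World'], lower_case=True,
#   ...                  use_words_aug=True, words_aug_join_char='_')
#   ['hello', 'world', 'hello_world']
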
def get_sentences_from_document(document, model_func=None):
    """
    Get sentences from a document.

    :param document: document to split into sentences
    :param model_func: sentence-model function; if specified, it will be
        used to split the document into sentences
    :return: document with the sentences separated by '\\n'
    """
    if model_func:
        sentences = model_func(document)
    else:
        sentences = document.splitlines()
    return '\n'.join(sentences)
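
# Illustrative usage of get_sentences_from_document; the lambda below is a
# stand-in for a real sentence-splitting model:
#   >>> get_sentences_from_document('A. B.', model_func=lambda d: d.split('. '))
#   'A\nB.'
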
def split_text_by_punctuation(text):
    """
    Split text by any Chinese or English punctuation.

    :param text: text to be split
    :return: sub-texts split by any Chinese or English punctuation
    """
    # any zh and en punctuation
    punctuation_pattern = r'[\u3000-\u303f\uff00-\uffef]|[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]'  # noqa: E501
    result = re.split(punctuation_pattern, text)
    result = [s.strip() for s in result if s.strip()]
    return result
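
# Illustrative usage of split_text_by_punctuation on mixed zh/en text:
#   >>> split_text_by_punctuation('你好，世界!Hello, world.')
#   ['你好', '世界', 'Hello', 'world']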