# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------
from typing import Dict
import regex as re
class UnionFind:
    def __init__(self):
        """Initialization method."""
        self.parent: Dict[int, int] = {}
    def find(self, x):
        """Find the root of x, compressing the path along the way."""
        if x not in self.parent:
            self.parent[x] = x
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]
    def union(self, x, y):
        """Merge the sets containing x and y, keeping the smaller root."""
        px = self.find(x)
        py = self.find(y)
        self.parent[px] = self.parent[py] = min(px, py)
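# A minimal usage sketch, not part of the original module: the sample ids
# below are made up, but they show how union/find groups items (e.g. pairs of
# near-duplicate samples) into clusters keyed by their smallest member.
#
# >>> uf = UnionFind()
# >>> uf.union(1, 2)
# >>> uf.union(2, 5)
# >>> uf.find(5)
# 1
# >>> uf.find(3)  # an unseen id becomes its own root
# 3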
def strip(document, strip_characters):
"""
Way faster than document.strip(strip_characters) since strip_characters is
now a set instead of a str, and it contains a lot of elements (all the
emojis).
:param document: document to be processed
:param strip_characters: characters used for stripping document
:return: stripped document
"""
if not document:
return document
beg_ind = 0
end_ind = len(document)
for i in range(len(document)):
if document[i] in strip_characters:
beg_ind += 1
else:
break
for i in range(1, len(document) + 1):
if document[-i] in strip_characters:
end_ind -= 1
else:
break
document_stripped = document[beg_ind:end_ind]
return document_stripped
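# Illustrative doctest-style sketch (inputs are assumed, not from the source):
#
# >>> strip('??hello world!!', {'?', '!'})
# 'hello world'
# >>> strip('!!!!', {'!'})
# ''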
def split_on_whitespace(document, new_line=False, tab=False):
"""
This method also removes concatenated spaces.
:param document: document to be splited
:param new_line: whether to split document with '\\\\n'
:param tag: whether to split document with '\\\\t'
:return: word list obtained after splitting document
"""
sep = [' '] + new_line * ['\n'] + tab * ['\t']
sep = '|'.join(sep)
split_document = re.split(sep, document)
split_document = [word for word in split_document if word]
return split_document
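# Illustrative sketch with assumed inputs: consecutive separators yield no
# empty tokens because empty strings are filtered out after the split.
#
# >>> split_on_whitespace('a  b\tc\nd')
# ['a', 'b\tc\nd']
# >>> split_on_whitespace('a  b\tc\nd', new_line=True, tab=True)
# ['a', 'b', 'c', 'd']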
def split_on_newline_tab_whitespace(document):
"""
This method is used to split the document into different levels of sub-
sentences.
First split on "\\\\n", then on "\\\\t", then on " ".
:param document: document to be splited
:return: sentence list obtained after splitting document
"""
sentences = document.split('\n')
sentences = [sentence.split('\t') for sentence in sentences]
sentences = [[
split_on_whitespace(subsentence) for subsentence in sentence
] for sentence in sentences]
return sentences
def merge_on_whitespace_tab_newline(sentences):
"""
This method is used to merge different levels of sub-sentences into one
document. Invert the method split_on_newline_tab_whitespace. Removes
concatenated separators.
:param sentences: sentence list to be merged
:return: document obtained after merging sub-sentences
"""
sentences = [[
' '.join(subsentence) for subsentence in sentence if subsentence
] for sentence in sentences]
sentences = ['\t'.join(sentence) for sentence in sentences if sentence]
if not sentences:
return ''
document = '\n'.join(sentences)
return document
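# Illustrative round-trip sketch with an assumed input: merging the nested
# structure produced by split_on_newline_tab_whitespace restores the document
# up to the removal of repeated separators.
#
# >>> doc = 'a  b\tc\nd'
# >>> split_on_newline_tab_whitespace(doc)
# [[['a', 'b'], ['c']], [['d']]]
# >>> merge_on_whitespace_tab_newline(split_on_newline_tab_whitespace(doc))
# 'a b\tc\nd'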
def words_augmentation(words, group_size, join_char):
"""
Augment words, especially for Chinese (without a space between words) and
Vietnamese (with a space between syllables).
:param word: word list to be augmented
:param group_size: the size of word groups that need to be merged
:param join_char: characters to be added between word group
:return: word list after augment
"""
augmentation = [
join_char.join(words[i:i + group_size])
for i in range(len(words) - group_size + 1)
]
return augmentation
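# Illustrative sketch with assumed inputs: adjacent words are grouped into
# n-grams, which approximates multi-character words in Chinese and
# multi-syllable words in Vietnamese.
#
# >>> words_augmentation(['我', '喜', '欢'], 2, '')
# ['我喜', '喜欢']
# >>> words_augmentation(['xin', 'chao'], 2, ' ')
# ['xin chao']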
def get_words_from_document(
        document,
        token_func=None,
        new_line=True,
        tab=True,
):
    """
    Get words from a document. Useful to compute ratios, like the
    stopwords ratio.

    :param document: document that needs to be split into words
    :param token_func: function of a tokenizer; if specified, the function
        will be used to split the document into different tokens
    :param new_line: whether to use '\\n' to split words
    :param tab: whether to use '\\t' to split words
    :return: word list obtained from the document
    """
    if token_func:
        words = token_func(document)
    else:
        words = split_on_whitespace(document, new_line, tab)
    return words
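# Illustrative sketch: token_func can be any callable that maps a string to a
# list of tokens; the lambda below is a stand-in for a real tokenizer.
#
# >>> get_words_from_document('Hello\tworld foo')
# ['Hello', 'world', 'foo']
# >>> get_words_from_document('Hello world', token_func=lambda s: s.lower().split())
# ['hello', 'world']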
def words_refinement(words,
                     lower_case=False,
                     strip_chars=None,
                     use_words_aug=False,
                     words_aug_group_sizes=[2],
                     words_aug_join_char=''):
    """
    Refine split words. Non-reversible since the document is split on
    multiple characters, words are stripped of special characters and
    characters are converted to lower case.

    :param words: the word list to be refined
    :param lower_case: whether to convert words to lower case
    :param strip_chars: chars that need to be stripped from words
    :param use_words_aug: whether to use word augmentation
    :param words_aug_group_sizes: the size of the word groups that need to
        be merged
    :param words_aug_join_char: character to be added between the words of
        a group
    :return: refined words or word list
    """
    if lower_case:
        words = [word.lower() for word in words]
    if strip_chars:
        words = [strip(word, strip_chars) for word in words]
        words = [word for word in words if word]
    if use_words_aug:
        augmentation = [
            words_augmentation(words, group_size, words_aug_join_char)
            for group_size in words_aug_group_sizes
        ]
        augmentation = [word for augm in augmentation for word in augm]
        words = words + augmentation
    return words
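# Illustrative sketch with assumed inputs: lower-casing, stripping and
# optional n-gram augmentation are applied in sequence.
#
# >>> words_refinement(['Hello,', 'World!'], lower_case=True, strip_chars={',', '!'})
# ['hello', 'world']
# >>> words_refinement(['a', 'b', 'c'], use_words_aug=True)
# ['a', 'b', 'c', 'ab', 'bc']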
def get_sentences_from_document(document, model_func=None):
"""
Get sentences from a document.
:param document: document that need to split sentences
:param model_func: function of sentence model, if specified, the
function will be used for spliting document into different
sentences.
:return: document with the sentences separated by '\\\\n'
"""
if model_func:
sentences = model_func(document)
else:
sentences = document.splitlines()
return '\n'.join(sentences)
def split_text_by_punctuation(text):
"""
Split text by any zh and en punctuation
:param text: text to be splitted.
:return: sub texts splitted by any zh and en punctuation
"""
# any zh and en punctuation
punctuation_pattern = r'[\u3000-\u303f\uff00-\uffef]|[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]' # noqa: E501
result = re.split(punctuation_pattern, text)
result = [s.strip() for s in result if s.strip()]
return result
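# Illustrative sketch with an assumed input: both CJK and ASCII punctuation
# act as split points, and surrounding whitespace is trimmed from each piece.
#
# >>> split_text_by_punctuation('你好，世界!Hello, world')
# ['你好', '世界', 'Hello', 'world']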