Source code for data_juicer.ops.common.helper_func
# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

from typing import Dict

import regex as re
def strip(document, strip_characters):
    """
    Way faster than document.strip(strip_characters) since strip_characters
    is now a set instead of a str, and it contains a lot of elements (all
    the emojis).

    :param document: document to be processed
    :param strip_characters: characters used for stripping document
    :return: stripped document
    """
    if not document:
        return document
    beg_ind = 0
    end_ind = len(document)
    for i in range(len(document)):
        if document[i] in strip_characters:
            beg_ind += 1
        else:
            break
    for i in range(1, len(document) + 1):
        if document[-i] in strip_characters:
            end_ind -= 1
        else:
            break
    document_stripped = document[beg_ind:end_ind]
    return document_stripped
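# A minimal usage sketch (illustrative, not part of the original module);
# the character set passed here is an assumed example:
# >>> strip('..hello..', {'.'})
# 'hello'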
def split_on_whitespace(document, new_line=False, tab=False):
    """
    This method also removes concatenated spaces.

    :param document: document to be split
    :param new_line: whether to split document with '\\n'
    :param tab: whether to split document with '\\t'
    :return: word list obtained after splitting document
    """
    sep = [' '] + new_line * ['\n'] + tab * ['\t']
    sep = '|'.join(sep)
    split_document = re.split(sep, document)
    split_document = [word for word in split_document if word]
    return split_document
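# Illustrative sketch (assumed input, not from the original module):
# concatenated separators yield no empty strings in the result:
# >>> split_on_whitespace('a  b\tc', tab=True)
# ['a', 'b', 'c']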
def split_on_newline_tab_whitespace(document):
    """
    This method is used to split the document into different levels of
    sub-sentences. First split on "\\n", then on "\\t", then on " ".

    :param document: document to be split
    :return: sentence list obtained after splitting document
    """
    sentences = document.split('\n')
    sentences = [sentence.split('\t') for sentence in sentences]
    sentences = [[split_on_whitespace(subsentence) for subsentence in sentence]
                 for sentence in sentences]
    return sentences
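# Illustrative sketch (assumed input): the result is a 3-level nested
# list, lines -> tab-separated fields -> words:
# >>> split_on_newline_tab_whitespace('a b\tc\nd')
# [[['a', 'b'], ['c']], [['d']]]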
def merge_on_whitespace_tab_newline(sentences):
    """
    This method is used to merge different levels of sub-sentences into one
    document. Inverts the method split_on_newline_tab_whitespace. Removes
    concatenated separators.

    :param sentences: sentence list to be merged
    :return: document obtained after merging sub-sentences
    """
    sentences = [[' '.join(subsentence) for subsentence in sentence
                  if subsentence] for sentence in sentences]
    sentences = ['\t'.join(sentence) for sentence in sentences if sentence]
    if not sentences:
        return ''
    document = '\n'.join(sentences)
    return document
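# Illustrative sketch (assumed input): merging inverts the split above,
# up to concatenated separators, which are collapsed:
# >>> merge_on_whitespace_tab_newline([[['a', 'b'], ['c']], [['d']]])
# 'a b\tc\nd'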
def words_augmentation(words, group_size, join_char):
    """
    Augment words, especially for Chinese (without a space between words)
    and Vietnamese (with a space between syllables).

    :param words: word list to be augmented
    :param group_size: the size of word groups that need to be merged
    :param join_char: characters to be added between word groups
    :return: word list after augmentation
    """
    augmentation = [
        join_char.join(words[i:i + group_size])
        for i in range(len(words) - group_size + 1)
    ]
    return augmentation
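# Illustrative sketch (assumed input): sliding windows of `group_size`
# adjacent words are joined with `join_char`:
# >>> words_augmentation(['a', 'b', 'c'], 2, '')
# ['ab', 'bc']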
def get_words_from_document(document,
                            token_func=None,
                            new_line=True,
                            tab=True):
    """
    Get words from a document. Useful to compute ratios, like the stopword
    ratio.

    :param document: document that needs to be split into words
    :param token_func: tokenizer function; if specified, it will be used to
        split the document into tokens
    :param new_line: whether to use '\\n' to split words
    :param tab: whether to use '\\t' to split words
    :return: word list obtained from document
    """
    if token_func:
        words = token_func(document)
    else:
        words = split_on_whitespace(document, new_line, tab)
    return words
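# Illustrative sketch (assumed input, no tokenizer supplied, so the
# whitespace splitter above is used):
# >>> get_words_from_document('hello\tworld\nfoo')
# ['hello', 'world', 'foo']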
def words_refinement(words,
                     lower_case=False,
                     strip_chars=None,
                     use_words_aug=False,
                     words_aug_group_sizes=[2],
                     words_aug_join_char=''):
    """
    Refine split words. Non-reversible since the document is split on
    multiple characters, words are stripped of special characters, and
    characters are converted to lowercase.

    :param words: the word list to be refined
    :param lower_case: whether to convert words to lowercase
    :param strip_chars: chars that need to be stripped from words
    :param use_words_aug: whether to use word augmentation
    :param words_aug_group_sizes: the sizes of word groups that need to be
        merged
    :param words_aug_join_char: characters to be added between word groups
    :return: refined word list
    """
    if lower_case:
        words = [word.lower() for word in words]
    if strip_chars:
        words = [strip(word, strip_chars) for word in words]
        words = [word for word in words if word]
    if use_words_aug:
        augmentation = [
            words_augmentation(words, group_size, words_aug_join_char)
            for group_size in words_aug_group_sizes
        ]
        augmentation = [word for augm in augmentation for word in augm]
        words = words + augmentation
    return words
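# Illustrative sketch (assumed inputs): lowercasing, stripping, and
# 2-gram augmentation combined:
# >>> words_refinement(['Hello,', 'World'], lower_case=True,
# ...                  strip_chars={','}, use_words_aug=True)
# ['hello', 'world', 'helloworld']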
def get_sentences_from_document(document, model_func=None):
    """
    Get sentences from a document.

    :param document: document that needs to be split into sentences
    :param model_func: sentence-model function; if specified, it will be
        used to split the document into sentences
    :return: document with the sentences separated by '\\n'
    """
    if model_func:
        sentences = model_func(document)
    else:
        sentences = document.splitlines()
    return '\n'.join(sentences)
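# Illustrative sketch (assumed input, no sentence model supplied, so the
# document is simply split on line boundaries):
# >>> get_sentences_from_document('First line.\nSecond line.')
# 'First line.\nSecond line.'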
def split_text_by_punctuation(text):
    """
    Split text by any Chinese or English punctuation.

    :param text: text to be split
    :return: sub-texts split by any Chinese or English punctuation
    """
    # any zh and en punctuation
    punctuation_pattern = r'[\u3000-\u303f\uff00-\uffef]|[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]'  # noqa: E501
    result = re.split(punctuation_pattern, text)
    result = [s.strip() for s in result if s.strip()]
    if not result:
        return [text]
    return result
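# Illustrative sketch (assumed input mixing English and Chinese
# punctuation); surrounding whitespace is stripped from each piece:
# >>> split_text_by_punctuation('Hello, world! 你好。')
# ['Hello', 'world', '你好']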