data_juicer
API Reference
data_juicer.core package
data_juicer.ops package
data_juicer.ops.filter package
data_juicer.ops.mapper package
data_juicer.ops.deduplicator package
data_juicer.ops.selector package
data_juicer.ops.common package
data_juicer.analysis package
data_juicer.config package
data_juicer.format package
data_juicer
Index
Index
_
|
A
|
B
|
C
|
D
|
E
|
F
|
G
|
H
|
I
|
J
|
K
|
L
|
M
|
N
|
O
|
P
|
Q
|
R
|
S
|
T
|
U
|
V
|
W
|
Z
_
__init__() (data_juicer.analysis.collector.TextTokenDistCollector method)
(data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.diversity_analysis.DiversityAnalysis method)
(data_juicer.analysis.DiversityAnalysis method)
(data_juicer.analysis.overall_analysis.OverallAnalysis method)
(data_juicer.analysis.OverallAnalysis method)
(data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
(data_juicer.core.Analyzer method)
(data_juicer.core.analyzer.Analyzer method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.data.NestedDatasetDict method)
(data_juicer.core.data.NestedQueryDict method)
(data_juicer.core.Executor method)
(data_juicer.core.executor.Executor method)
(data_juicer.core.Exporter method)
(data_juicer.core.exporter.Exporter method)
(data_juicer.core.Monitor method)
(data_juicer.core.monitor.Monitor method)
(data_juicer.core.NestedDataset method)
(data_juicer.core.ray_data.RayDataset method)
(data_juicer.core.ray_executor.RayExecutor method)
(data_juicer.core.Tracer method)
(data_juicer.core.tracer.Tracer method)
(data_juicer.format.csv_formatter.CsvFormatter method)
(data_juicer.format.CsvFormatter method)
(data_juicer.format.empty_formatter.EmptyFormatter method)
(data_juicer.format.empty_formatter.RayEmptyFormatter method)
(data_juicer.format.EmptyFormatter method)
(data_juicer.format.formatter.LocalFormatter method)
(data_juicer.format.formatter.RemoteFormatter method)
(data_juicer.format.json_formatter.JsonFormatter method)
(data_juicer.format.JsonFormatter method)
(data_juicer.format.LocalFormatter method)
(data_juicer.format.mixture_formatter.MixtureFormatter method)
(data_juicer.format.MixtureFormatter method)
(data_juicer.format.parquet_formatter.ParquetFormatter method)
(data_juicer.format.ParquetFormatter method)
(data_juicer.format.RayEmptyFormatter method)
(data_juicer.format.RemoteFormatter method)
(data_juicer.format.text_formatter.TextFormatter method)
(data_juicer.format.TextFormatter method)
(data_juicer.format.tsv_formatter.TsvFormatter method)
(data_juicer.format.TsvFormatter method)
(data_juicer.ops.Aggregator method)
(data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.base_op.Aggregator method)
(data_juicer.ops.base_op.Deduplicator method)
(data_juicer.ops.base_op.Filter method)
(data_juicer.ops.base_op.Grouper method)
(data_juicer.ops.base_op.Mapper method)
(data_juicer.ops.base_op.OP method)
(data_juicer.ops.base_op.Selector method)
(data_juicer.ops.common.helper_func.UnionFind method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.RayVideoDeduplicator method)
(data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method)
(data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method)
(data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method)
(data_juicer.ops.filter.image_size_filter.ImageSizeFilter method)
(data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.perplexity_filter.PerplexityFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.stopwords_filter.StopWordsFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.suffix_filter.SuffixFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.text_action_filter.TextActionFilter method)
(data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.text_length_filter.TextLengthFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.token_num_filter.TokenNumFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method)
(data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method)
(data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method)
(data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method)
(data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
(data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.words_num_filter.WordsNumFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.grouper.key_value_grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.naive_grouper.NaiveGrouper method)
(data_juicer.ops.grouper.NaiveGrouper method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.chinese_convert_mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper method)
(data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.CleanIpMapper method)
(data_juicer.ops.mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.image_blur_mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.image_captioning_mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.image_diffusion_mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.image_face_blur_mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.image_tagging_mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method)
(data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.remove_non_chinese_character_mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.remove_repeat_sentences_mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.replace_content_mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
(data_juicer.ops.mapper.video_captioning_from_audio_mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.video_captioning_from_frames_mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.video_captioning_from_summarizer_mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.video_captioning_from_video_mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.video_extract_frames_mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.video_face_blur_mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.video_remove_watermark_mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.video_resize_resolution_mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.video_tagging_from_audio_mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.video_tagging_from_frames_mapper.VideoTaggingFromFramesMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.VideoTaggingFromFramesMapper method)
(data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper method)
(data_juicer.ops.mapper.WhitespaceNormalizationMapper method)
(data_juicer.ops.op_fusion.FusedFilter method)
(data_juicer.ops.Selector method)
(data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.random_selector.RandomSelector method)
(data_juicer.ops.selector.RandomSelector method)
(data_juicer.ops.selector.range_specified_field_selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector method)
(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
(data_juicer.utils.auto_install_utils.AutoInstaller method)
(data_juicer.utils.cache_utils.DatasetCacheControl method)
(data_juicer.utils.ckpt_utils.CheckpointManager method)
(data_juicer.utils.compress.CacheCompressManager method)
(data_juicer.utils.compress.CompressManager method)
(data_juicer.utils.fingerprint_utils.Hasher method)
(data_juicer.utils.lazy_loader.LazyLoader method)
(data_juicer.utils.logger_utils.StreamToLoguru method)
(data_juicer.utils.model_utils.APIModel method)
(data_juicer.utils.registry.Registry method)
A
adapt_workloads() (data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
Adapter (class in data_juicer.core)
(class in data_juicer.core.adapter)
add_column() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
add_message() (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
add_parameters() (data_juicer.ops.base_op.OP method)
add_same_content_to_new_column() (in module data_juicer.core.data)
add_suffix_to_filename() (in module data_juicer.utils.file_utils)
add_suffixes() (in module data_juicer.format.formatter)
Aggregator (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
alnum_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
alpha_token_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
AlphanumericFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.alphanumeric_filter)
analyze() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.diversity_analysis.DiversityAnalysis method)
(data_juicer.analysis.DiversityAnalysis method)
(data_juicer.analysis.overall_analysis.OverallAnalysis method)
(data_juicer.analysis.OverallAnalysis method)
analyze_resource_util_list() (data_juicer.core.Monitor static method)
(data_juicer.core.monitor.Monitor static method)
analyze_single_resource_util() (data_juicer.core.Monitor static method)
(data_juicer.core.monitor.Monitor static method)
analyze_small_batch() (data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
Analyzer (class in data_juicer.core)
(class in data_juicer.core.analyzer)
APIModel (class in data_juicer.utils.model_utils)
aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute)
assertDatasetEqual() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method)
attribute_descriptions (data_juicer.utils.constant.Fields attribute)
attribute_summary() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
attribute_support_texts (data_juicer.utils.constant.Fields attribute)
attributes (data_juicer.utils.constant.Fields attribute)
audio (data_juicer.utils.mm_utils.SpecialTokens attribute)
audio_duration (data_juicer.utils.constant.StatsKeysConstant attribute)
audio_nmf_snr (data_juicer.utils.constant.StatsKeysConstant attribute)
audio_sizes (data_juicer.utils.constant.StatsKeysConstant attribute)
AudioDurationFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.audio_duration_filter)
AudioFFmpegWrappedMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper)
AudioNMFSNRFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.audio_nmf_snr_filter)
AudioSizeFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.audio_size_filter)
AutoInstaller (class in data_juicer.utils.auto_install_utils)
AV_STREAM_THREAD_TYPE (in module data_juicer.utils.mm_utils)
avaliable_detectors (data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper attribute)
(data_juicer.ops.mapper.VideoSplitBySceneMapper attribute)
AverageLineLengthFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.average_line_length_filter)
avg_line_length (data_juicer.utils.constant.StatsKeysConstant attribute)
avg_split_string_list_under_limit() (in module data_juicer.utils.common_utils)
B
BaseCompressor (class in data_juicer.utils.compress)
BaseFormatter (class in data_juicer.format.formatter)
batch_size_strategy() (data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
build_input() (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
C
CacheCompressManager (class in data_juicer.utils.compress)
calculate_hash() (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.RayVideoDeduplicator method)
calculate_np() (in module data_juicer.utils.process_utils)
calculate_resized_dimensions() (in module data_juicer.utils.mm_utils)
CalibrateQAMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.calibrate_qa_mapper)
CalibrateQueryMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.calibrate_query_mapper)
CalibrateResponseMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.calibrate_response_mapper)
call_gpt_vision_api() (in module data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper)
catch_map_batches_exception() (in module data_juicer.ops.base_op)
catch_map_single_exception() (in module data_juicer.ops.base_op)
category_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method)
char_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
CharacterRepetitionFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.character_repetition_filter)
check() (data_juicer.utils.auto_install_utils.AutoInstaller method)
check_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method)
check_model() (in module data_juicer.utils.model_utils)
check_ops_to_skip() (data_juicer.utils.ckpt_utils.CheckpointManager method)
CheckpointManager (class in data_juicer.utils.ckpt_utils)
ChineseConvertMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.chinese_convert_mapper)
CleanCopyrightMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.clean_copyright_mapper)
CleanEmailMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.clean_email_mapper)
CleanHtmlMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.clean_html_mapper)
CleanIpMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.clean_ip_mapper)
CleanLinksMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.clean_links_mapper)
cleanup_cache_files() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
(data_juicer.utils.compress.CacheCompressManager method)
cleanup_compressed_cache_files() (in module data_juicer.utils.compress)
close_video() (in module data_juicer.utils.mm_utils)
collect() (data_juicer.analysis.collector.TextTokenDistCollector method)
ColumnWiseAnalysis (class in data_juicer.analysis)
(class in data_juicer.analysis.column_wise_analysis)
compress() (data_juicer.utils.compress.BaseCompressor static method)
(data_juicer.utils.compress.CacheCompressManager method)
(data_juicer.utils.compress.CompressManager method)
(data_juicer.utils.compress.Compressor class method)
(data_juicer.utils.compress.GzipCompressor static method)
(data_juicer.utils.compress.Lz4Compressor static method)
(data_juicer.utils.compress.ZstdCompressor static method)
(in module data_juicer.utils.compress)
CompressionOff (class in data_juicer.utils.compress)
CompressManager (class in data_juicer.utils.compress)
Compressor (class in data_juicer.utils.compress)
compressors (data_juicer.utils.compress.Compressor attribute)
compute() (data_juicer.analysis.diversity_analysis.DiversityAnalysis method)
(data_juicer.analysis.DiversityAnalysis method)
compute_flow() (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
compute_hash() (data_juicer.ops.base_op.Deduplicator method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
compute_nmf_snr() (in module data_juicer.ops.filter.audio_nmf_snr_filter)
compute_stats_batched() (data_juicer.ops.base_op.Filter method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.perplexity_filter.PerplexityFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.text_length_filter.TextLengthFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.words_num_filter.WordsNumFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
(data_juicer.ops.op_fusion.FusedFilter method)
compute_stats_single() (data_juicer.ops.base_op.Filter method)
(data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method)
(data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method)
(data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method)
(data_juicer.ops.filter.image_size_filter.ImageSizeFilter method)
(data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.stopwords_filter.StopWordsFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.suffix_filter.SuffixFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.text_action_filter.TextActionFilter method)
(data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.token_num_filter.TokenNumFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method)
(data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method)
(data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method)
(data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
config_backup() (in module data_juicer.config.config)
context (data_juicer.utils.constant.Fields attribute)
convert_arrow_to_python() (in module data_juicer.ops.base_op)
convert_dict_list_to_list_dict() (in module data_juicer.ops.base_op)
convert_list_dict_to_dict_list() (in module data_juicer.ops.base_op)
convert_to_absolute_paths() (in module data_juicer.core.ray_data)
copy_data() (in module data_juicer.utils.file_utils)
create_directory_if_not_exists() (in module data_juicer.utils.file_utils)
create_replacer() (in module data_juicer.ops.mapper.video_split_by_duration_mapper)
(in module data_juicer.ops.mapper.video_split_by_key_frame_mapper)
CrossEntropyMeasure (class in data_juicer.analysis.measure)
CsvFormatter (class in data_juicer.format)
(class in data_juicer.format.csv_formatter)
cuda_device_count() (in module data_juicer)
cut_video_by_seconds() (in module data_juicer.utils.mm_utils)
D
data_juicer
module
data_juicer.analysis
module
data_juicer.analysis.collector
module
data_juicer.analysis.column_wise_analysis
module
data_juicer.analysis.diversity_analysis
module
data_juicer.analysis.draw
module
data_juicer.analysis.measure
module
data_juicer.analysis.overall_analysis
module
data_juicer.config
module
data_juicer.config.config
module
data_juicer.core
module
data_juicer.core.adapter
module
data_juicer.core.analyzer
module
data_juicer.core.data
module
data_juicer.core.executor
module
data_juicer.core.exporter
module
data_juicer.core.monitor
module
data_juicer.core.ray_data
module
data_juicer.core.ray_executor
module
data_juicer.core.tracer
module
data_juicer.format
module
data_juicer.format.csv_formatter
module
data_juicer.format.empty_formatter
module
data_juicer.format.formatter
module
data_juicer.format.json_formatter
module
data_juicer.format.load
module
data_juicer.format.mixture_formatter
module
data_juicer.format.parquet_formatter
module
data_juicer.format.text_formatter
module
data_juicer.format.tsv_formatter
module
data_juicer.ops
module
data_juicer.ops.aggregator
module
data_juicer.ops.aggregator.entity_attribute_aggregator
module
data_juicer.ops.aggregator.most_relavant_entities_aggregator
module
data_juicer.ops.aggregator.nested_aggregator
module
data_juicer.ops.base_op
module
data_juicer.ops.common
module
data_juicer.ops.common.helper_func
module
data_juicer.ops.common.special_characters
module
data_juicer.ops.deduplicator
module
data_juicer.ops.deduplicator.document_deduplicator
module
data_juicer.ops.deduplicator.document_minhash_deduplicator
module
data_juicer.ops.deduplicator.document_simhash_deduplicator
module
data_juicer.ops.deduplicator.image_deduplicator
module
data_juicer.ops.deduplicator.ray_basic_deduplicator
module
data_juicer.ops.deduplicator.ray_document_deduplicator
module
data_juicer.ops.deduplicator.ray_image_deduplicator
module
data_juicer.ops.deduplicator.ray_video_deduplicator
module
data_juicer.ops.deduplicator.video_deduplicator
module
data_juicer.ops.filter
module
data_juicer.ops.filter.alphanumeric_filter
module
data_juicer.ops.filter.audio_duration_filter
module
data_juicer.ops.filter.audio_nmf_snr_filter
module
data_juicer.ops.filter.audio_size_filter
module
data_juicer.ops.filter.average_line_length_filter
module
data_juicer.ops.filter.character_repetition_filter
module
data_juicer.ops.filter.flagged_words_filter
module
data_juicer.ops.filter.image_aesthetics_filter
module
data_juicer.ops.filter.image_aspect_ratio_filter
module
data_juicer.ops.filter.image_face_count_filter
module
data_juicer.ops.filter.image_face_ratio_filter
module
data_juicer.ops.filter.image_nsfw_filter
module
data_juicer.ops.filter.image_pair_similarity_filter
module
data_juicer.ops.filter.image_shape_filter
module
data_juicer.ops.filter.image_size_filter
module
data_juicer.ops.filter.image_text_matching_filter
module
data_juicer.ops.filter.image_text_similarity_filter
module
data_juicer.ops.filter.image_watermark_filter
module
data_juicer.ops.filter.language_id_score_filter
module
data_juicer.ops.filter.maximum_line_length_filter
module
data_juicer.ops.filter.perplexity_filter
module
data_juicer.ops.filter.phrase_grounding_recall_filter
module
data_juicer.ops.filter.special_characters_filter
module
data_juicer.ops.filter.specified_field_filter
module
data_juicer.ops.filter.specified_numeric_field_filter
module
data_juicer.ops.filter.stopwords_filter
module
data_juicer.ops.filter.suffix_filter
module
data_juicer.ops.filter.text_action_filter
module
data_juicer.ops.filter.text_entity_dependency_filter
module
data_juicer.ops.filter.text_length_filter
module
data_juicer.ops.filter.token_num_filter
module
data_juicer.ops.filter.video_aesthetics_filter
module
data_juicer.ops.filter.video_aspect_ratio_filter
module
data_juicer.ops.filter.video_duration_filter
module
data_juicer.ops.filter.video_frames_text_similarity_filter
module
data_juicer.ops.filter.video_motion_score_filter
module
data_juicer.ops.filter.video_motion_score_raft_filter
module
data_juicer.ops.filter.video_nsfw_filter
module
data_juicer.ops.filter.video_ocr_area_ratio_filter
module
data_juicer.ops.filter.video_resolution_filter
module
data_juicer.ops.filter.video_tagging_from_frames_filter
module
data_juicer.ops.filter.video_watermark_filter
module
data_juicer.ops.filter.word_repetition_filter
module
data_juicer.ops.filter.words_num_filter
module
data_juicer.ops.grouper
module
data_juicer.ops.grouper.key_value_grouper
module
data_juicer.ops.grouper.naive_grouper
module
data_juicer.ops.load
module
data_juicer.ops.mapper
module
data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper
module
data_juicer.ops.mapper.calibrate_qa_mapper
module
data_juicer.ops.mapper.calibrate_query_mapper
module
data_juicer.ops.mapper.calibrate_response_mapper
module
data_juicer.ops.mapper.chinese_convert_mapper
module
data_juicer.ops.mapper.clean_copyright_mapper
module
data_juicer.ops.mapper.clean_email_mapper
module
data_juicer.ops.mapper.clean_html_mapper
module
data_juicer.ops.mapper.clean_ip_mapper
module
data_juicer.ops.mapper.clean_links_mapper
module
data_juicer.ops.mapper.expand_macro_mapper
module
data_juicer.ops.mapper.extract_entity_attribute_mapper
module
data_juicer.ops.mapper.extract_entity_relation_mapper
module
data_juicer.ops.mapper.extract_event_mapper
module
data_juicer.ops.mapper.extract_keyword_mapper
module
data_juicer.ops.mapper.extract_nickname_mapper
module
data_juicer.ops.mapper.extract_support_text_mapper
module
data_juicer.ops.mapper.fix_unicode_mapper
module
data_juicer.ops.mapper.generate_qa_from_examples_mapper
module
data_juicer.ops.mapper.generate_qa_from_text_mapper
module
data_juicer.ops.mapper.image_blur_mapper
module
data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
module
data_juicer.ops.mapper.image_captioning_mapper
module
data_juicer.ops.mapper.image_diffusion_mapper
module
data_juicer.ops.mapper.image_face_blur_mapper
module
data_juicer.ops.mapper.image_tagging_mapper
module
data_juicer.ops.mapper.nlpaug_en_mapper
module
data_juicer.ops.mapper.nlpcda_zh_mapper
module
data_juicer.ops.mapper.optimize_qa_mapper
module
data_juicer.ops.mapper.optimize_query_mapper
module
data_juicer.ops.mapper.optimize_response_mapper
module
data_juicer.ops.mapper.pair_preference_mapper
module
data_juicer.ops.mapper.punctuation_normalization_mapper
module
data_juicer.ops.mapper.python_file_mapper
module
data_juicer.ops.mapper.python_lambda_mapper
module
data_juicer.ops.mapper.relation_identity_mapper
module
data_juicer.ops.mapper.remove_bibliography_mapper
module
data_juicer.ops.mapper.remove_comments_mapper
module
data_juicer.ops.mapper.remove_header_mapper
module
data_juicer.ops.mapper.remove_long_words_mapper
module
data_juicer.ops.mapper.remove_non_chinese_character_mapper
module
data_juicer.ops.mapper.remove_repeat_sentences_mapper
module
data_juicer.ops.mapper.remove_specific_chars_mapper
module
data_juicer.ops.mapper.remove_table_text_mapper
module
data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper
module
data_juicer.ops.mapper.replace_content_mapper
module
data_juicer.ops.mapper.sentence_split_mapper
module
data_juicer.ops.mapper.text_chunk_mapper
module
data_juicer.ops.mapper.video_captioning_from_audio_mapper
module
data_juicer.ops.mapper.video_captioning_from_frames_mapper
module
data_juicer.ops.mapper.video_captioning_from_summarizer_mapper
module
data_juicer.ops.mapper.video_captioning_from_video_mapper
module
data_juicer.ops.mapper.video_extract_frames_mapper
module
data_juicer.ops.mapper.video_face_blur_mapper
module
data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper
module
data_juicer.ops.mapper.video_remove_watermark_mapper
module
data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
module
data_juicer.ops.mapper.video_resize_resolution_mapper
module
data_juicer.ops.mapper.video_split_by_duration_mapper
module
data_juicer.ops.mapper.video_split_by_key_frame_mapper
module
data_juicer.ops.mapper.video_split_by_scene_mapper
module
data_juicer.ops.mapper.video_tagging_from_audio_mapper
module
data_juicer.ops.mapper.video_tagging_from_frames_mapper
module
data_juicer.ops.mapper.whitespace_normalization_mapper
module
data_juicer.ops.op_fusion
module
data_juicer.ops.selector
module
data_juicer.ops.selector.frequency_specified_field_selector
module
data_juicer.ops.selector.random_selector
module
data_juicer.ops.selector.range_specified_field_selector
module
data_juicer.ops.selector.topk_specified_field_selector
module
data_juicer.tools
module
data_juicer.utils
module
data_juicer.utils.asset_utils
module
data_juicer.utils.auto_install_mapping
module
data_juicer.utils.auto_install_utils
module
data_juicer.utils.availability_utils
module
data_juicer.utils.cache_utils
module
data_juicer.utils.ckpt_utils
module
data_juicer.utils.common_utils
module
data_juicer.utils.compress
module
data_juicer.utils.constant
module
data_juicer.utils.file_utils
module
data_juicer.utils.fingerprint_utils
module
data_juicer.utils.lazy_loader
module
data_juicer.utils.logger_utils
module
data_juicer.utils.mm_utils
module
data_juicer.utils.model_utils
module
data_juicer.utils.process_utils
module
data_juicer.utils.registry
module
data_juicer.utils.resource_utils
module
data_juicer.utils.unittest_utils
module
DataJuicerTestCaseBase (class in data_juicer.utils.unittest_utils)
dataset_cache_control() (in module data_juicer.utils.cache_utils)
DatasetCacheControl (class in data_juicer.utils.cache_utils)
decompress() (data_juicer.utils.compress.CacheCompressManager method)
(data_juicer.utils.compress.CompressManager method)
(in module data_juicer.utils.compress)
Deduplicator (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
DEFAULT_ATTR_PATTERN_TEMPLATE (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
DEFAULT_COMPLETION_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
DEFAULT_CONTINUE_PROMPT (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_DEMON_PATTERN (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
DEFAULT_ENTITY_PATTERN (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_ENTITY_TYPES (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_EXAMPLE_PROMPT (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
DEFAULT_IF_LOOP_PROMPT (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_INPUT_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator attribute)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator attribute)
(data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute)
(data_juicer.ops.aggregator.NestedAggregator attribute)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_OUTPUT_PATTERN (data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator attribute)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator attribute)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
DEFAULT_OUTPUT_PATTERN_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_PROMPT_TEMPLATE (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
DEFAULT_RECORD_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_REFERENCE_TEMPLATE (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
DEFAULT_RELATION_PATTERN (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_SUB_DOC_TEMPLATE (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute)
(data_juicer.ops.aggregator.NestedAggregator attribute)
DEFAULT_SYSTEM_PROMPT (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute)
(data_juicer.ops.aggregator.NestedAggregator attribute)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.calibrate_query_mapper.CalibrateQueryMapper attribute)
(data_juicer.ops.mapper.calibrate_response_mapper.CalibrateResponseMapper attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQueryMapper attribute)
(data_juicer.ops.mapper.CalibrateResponseMapper attribute)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.optimize_query_mapper.OptimizeQueryMapper attribute)
(data_juicer.ops.mapper.optimize_response_mapper.OptimizeResponseMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.OptimizeQueryMapper attribute)
(data_juicer.ops.mapper.OptimizeResponseMapper attribute)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
DEFAULT_SYSTEM_PROMPT_TEMPLATE (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_SYSTEM_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator attribute)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator attribute)
DEFAULT_TUPLE_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
detect_faces() (in module data_juicer.utils.mm_utils)
dict_to_hash() (in module data_juicer.utils.common_utils)
dispatch (data_juicer.utils.fingerprint_utils.Hasher attribute)
display_config() (in module data_juicer.config.config)
DiversityAnalysis (class in data_juicer.analysis)
(class in data_juicer.analysis.diversity_analysis)
dj_configs (data_juicer.utils.constant.JobRequiredKeys attribute)
DJDataset (class in data_juicer.core.data)
DocumentDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.document_deduplicator)
DocumentMinhashDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.document_minhash_deduplicator)
DocumentSimhashDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.document_simhash_deduplicator)
draw_box() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.ColumnWiseAnalysis method)
draw_heatmap() (in module data_juicer.analysis.draw)
draw_hist() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.ColumnWiseAnalysis method)
draw_resource_util_graph() (data_juicer.core.Monitor static method)
(data_juicer.core.monitor.Monitor static method)
draw_wordcloud() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.ColumnWiseAnalysis method)
DYNAMIC_FIELDS (data_juicer.core.Monitor attribute)
(data_juicer.core.monitor.Monitor attribute)
E
EMPTY_HASH_VALUE (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator attribute)
(data_juicer.ops.deduplicator.RayBasicDeduplicator attribute)
empty_history() (data_juicer.ops.base_op.OP method)
EmptyFormatter (class in data_juicer.format)
(class in data_juicer.format.empty_formatter)
entity (data_juicer.utils.constant.Fields attribute)
entity_description (data_juicer.utils.constant.Fields attribute)
entity_name (data_juicer.utils.constant.Fields attribute)
entity_type (data_juicer.utils.constant.Fields attribute)
EntityAttributeAggregator (class in data_juicer.ops.aggregator)
(class in data_juicer.ops.aggregator.entity_attribute_aggregator)
EntropyMeasure (class in data_juicer.analysis.measure)
eoc (data_juicer.utils.mm_utils.SpecialTokens attribute)
event_description (data_juicer.utils.constant.Fields attribute)
execute_and_probe() (data_juicer.core.Adapter static method)
(data_juicer.core.adapter.Adapter static method)
Executor (class in data_juicer.core)
(class in data_juicer.core.executor)
ExpandMacroMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.expand_macro_mapper)
export() (data_juicer.core.Exporter method)
(data_juicer.core.exporter.Exporter method)
export_compute_stats() (data_juicer.core.Exporter method)
(data_juicer.core.exporter.Exporter method)
export_config() (in module data_juicer.config)
(in module data_juicer.config.config)
Exporter (class in data_juicer.core)
(class in data_juicer.core.exporter)
extra_configs (data_juicer.utils.constant.JobRequiredKeys attribute)
extract() (data_juicer.utils.compress.Extractor class method)
extract_audio_from_video() (in module data_juicer.utils.mm_utils)
extract_key_frames() (in module data_juicer.utils.mm_utils)
extract_key_frames_by_seconds() (in module data_juicer.utils.mm_utils)
extract_txt_from_docx() (in module data_juicer.format.text_formatter)
extract_txt_from_pdf() (in module data_juicer.format.text_formatter)
extract_video_frames_uniformly() (in module data_juicer.utils.mm_utils)
extract_video_frames_uniformly_by_seconds() (in module data_juicer.utils.mm_utils)
ExtractEntityAttributeMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_entity_attribute_mapper)
ExtractEntityRelationMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_entity_relation_mapper)
ExtractEventMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_event_mapper)
ExtractKeywordMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_keyword_mapper)
ExtractNicknameMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_nickname_mapper)
Extractor (class in data_juicer.utils.compress)
ExtractSupportTextMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.extract_support_text_mapper)
F
face_counts (data_juicer.utils.constant.StatsKeysConstant attribute)
face_detections (data_juicer.utils.constant.StatsKeysConstant attribute)
face_ratios (data_juicer.utils.constant.StatsKeysConstant attribute)
Fields (class in data_juicer.utils.constant)
FileLock (class in data_juicer.utils.compress)
Filter (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
filter() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
filter_batch() (in module data_juicer.core.ray_data)
find() (data_juicer.ops.common.helper_func.UnionFind method)
find_files_with_suffix() (in module data_juicer.utils.file_utils)
find_noun_phrases() (in module data_juicer.ops.filter.phrase_grounding_recall_filter)
find_root_verb_and_its_dobj() (in module data_juicer.analysis.diversity_analysis)
find_root_verb_and_its_dobj_in_string() (in module data_juicer.analysis.diversity_analysis)
FixUnicodeMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.fix_unicode_mapper)
flagged_words_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
FlaggedWordFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.flagged_words_filter)
flush() (data_juicer.utils.logger_utils.StreamToLoguru method)
follow_read() (in module data_juicer.utils.file_utils)
format_cache_file_name() (data_juicer.utils.compress.CacheCompressManager method)
free_models() (in module data_juicer.utils.model_utils)
FrequencySpecifiedFieldSelector (class in data_juicer.ops.selector)
(class in data_juicer.ops.selector.frequency_specified_field_selector)
from_dict() (data_juicer.core.data.NestedDataset class method)
(data_juicer.core.NestedDataset class method)
fuse_filter_group() (in module data_juicer.ops.op_fusion)
fuse_operators() (in module data_juicer.ops.op_fusion)
FusedFilter (class in data_juicer.ops.op_fusion)
G
generate_dataset() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method)
generate_fingerprint() (in module data_juicer.utils.fingerprint_utils)
GenerateQAFromExamplesMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.generate_qa_from_examples_mapper)
GenerateQAFromTextMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.generate_qa_from_text_mapper)
get() (data_juicer.utils.registry.Registry method)
get_abs_path() (in module data_juicer.core.ray_data)
get_access_log() (data_juicer.utils.constant.StatsKeysMeta method)
get_backup_model_link() (in module data_juicer.utils.model_utils)
get_caller_name() (in module data_juicer.utils.logger_utils)
get_cpu_count() (in module data_juicer.utils.resource_utils)
get_cpu_utilization() (in module data_juicer.utils.resource_utils)
get_decoded_frames_from_video() (in module data_juicer.utils.mm_utils)
get_diversity() (in module data_juicer.analysis.diversity_analysis)
get_file_size() (in module data_juicer.utils.mm_utils)
get_hash_method() (in module data_juicer.ops.deduplicator.image_deduplicator)
(in module data_juicer.ops.deduplicator.ray_image_deduplicator)
get_init_configs() (in module data_juicer.config)
(in module data_juicer.config.config)
get_key_frame_seconds() (in module data_juicer.utils.mm_utils)
get_left_process_list() (data_juicer.utils.ckpt_utils.CheckpointManager method)
get_log_file_path() (in module data_juicer.utils.logger_utils)
get_min_cuda_memory() (in module data_juicer.utils.process_utils)
get_model() (in module data_juicer.utils.model_utils)
get_num_gpus() (in module data_juicer.core.ray_data)
get_reader() (data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
get_row_col() (in module data_juicer.analysis.column_wise_analysis)
get_sentences_from_document() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
get_special_tokens() (in module data_juicer.utils.mm_utils)
get_split_key_frame() (data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
get_text_chunks() (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
get_video_duration() (in module data_juicer.utils.mm_utils)
get_words_from_document() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
getvalue() (data_juicer.utils.logger_utils.StreamToLoguru method)
GiB (data_juicer.core.Exporter attribute)
(data_juicer.core.exporter.Exporter attribute)
Grouper (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
GzipCompressor (class in data_juicer.utils.compress)
H
hash (data_juicer.utils.constant.HashKeys attribute)
hash() (data_juicer.utils.fingerprint_utils.Hasher class method)
hash_bytes() (data_juicer.utils.fingerprint_utils.Hasher class method)
hash_default() (data_juicer.utils.fingerprint_utils.Hasher class method)
Hasher (class in data_juicer.utils.fingerprint_utils)
HashKeys (class in data_juicer.utils.constant)
hexdigest() (data_juicer.utils.fingerprint_utils.Hasher method)
HiddenPrints (class in data_juicer.utils.logger_utils)
hook (data_juicer.utils.constant.JobRequiredKeys attribute)
I
image (data_juicer.utils.mm_utils.SpecialTokens attribute)
image_aesthetics_scores (data_juicer.utils.constant.StatsKeysConstant attribute)
image_byte_to_base64() (in module data_juicer.utils.mm_utils)
image_height (data_juicer.utils.constant.StatsKeysConstant attribute)
image_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute)
image_pair_similarity (data_juicer.utils.constant.StatsKeysConstant attribute)
image_path_to_base64() (in module data_juicer.utils.mm_utils)
image_sizes (data_juicer.utils.constant.StatsKeysConstant attribute)
image_tags (data_juicer.utils.constant.Fields attribute)
image_text_matching_score (data_juicer.utils.constant.StatsKeysConstant attribute)
image_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute)
image_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute)
image_width (data_juicer.utils.constant.StatsKeysConstant attribute)
ImageAestheticsFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_aesthetics_filter)
ImageAspectRatioFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_aspect_ratio_filter)
ImageBlurMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_blur_mapper)
ImageCaptioningFromGPT4VMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper)
ImageCaptioningMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_captioning_mapper)
ImageDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.image_deduplicator)
ImageDiffusionMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_diffusion_mapper)
ImageFaceBlurMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_face_blur_mapper)
ImageFaceCountFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_face_count_filter)
ImageFaceRatioFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_face_ratio_filter)
imagehash (data_juicer.utils.constant.HashKeys attribute)
ImageNSFWFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_nsfw_filter)
ImagePairSimilarityFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_pair_similarity_filter)
ImageShapeFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_shape_filter)
ImageSizeFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_size_filter)
ImageTaggingMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.image_tagging_mapper)
ImageTextMatchingFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_text_matching_filter)
ImageTextSimilarityFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_text_similarity_filter)
ImageWatermarkFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.image_watermark_filter)
init_configs() (in module data_juicer.config)
(in module data_juicer.config.config)
init_setup_from_cfg() (in module data_juicer.config.config)
insert_texts_after_placeholders() (in module data_juicer.utils.mm_utils)
insight_mining() (data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
install() (data_juicer.utils.auto_install_utils.AutoInstaller method)
InterVars (class in data_juicer.utils.constant)
iou() (in module data_juicer.utils.mm_utils)
is_absolute_path() (in module data_juicer.utils.file_utils)
is_batched_op() (data_juicer.ops.base_op.OP method)
is_cuda_available() (in module data_juicer)
is_duplicate (data_juicer.utils.constant.HashKeys attribute)
is_float() (in module data_juicer.utils.common_utils)
is_number() (in module data_juicer.ops.filter.specified_numeric_field_filter)
is_string_list() (in module data_juicer.utils.common_utils)
J
JobRequiredKeys (class in data_juicer.utils.constant)
JSDivMeasure (class in data_juicer.analysis.measure)
JsonFormatter (class in data_juicer.format)
(class in data_juicer.format.json_formatter)
JSONStreamDatasource (class in data_juicer.core.ray_data)
K
KeyValueGrouper (class in data_juicer.ops.grouper)
(class in data_juicer.ops.grouper.key_value_grouper)
keyword (data_juicer.utils.constant.Fields attribute)
KiB (data_juicer.core.Exporter attribute)
(data_juicer.core.exporter.Exporter attribute)
KLDivMeasure (class in data_juicer.analysis.measure)
L
lang (data_juicer.utils.constant.StatsKeysConstant attribute)
lang_score (data_juicer.utils.constant.StatsKeysConstant attribute)
LanguageIDScoreFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.language_id_score_filter)
LazyLoader (class in data_juicer.utils.lazy_loader)
light_rag_extraction() (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
lines (data_juicer.utils.constant.InterVars attribute)
list() (data_juicer.utils.registry.Registry method)
load_audio() (in module data_juicer.utils.mm_utils)
load_audios() (in module data_juicer.utils.mm_utils)
load_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method)
load_data_with_context() (in module data_juicer.utils.mm_utils)
load_dataset() (data_juicer.format.empty_formatter.EmptyFormatter method)
(data_juicer.format.empty_formatter.RayEmptyFormatter method)
(data_juicer.format.EmptyFormatter method)
(data_juicer.format.formatter.BaseFormatter method)
(data_juicer.format.formatter.LocalFormatter method)
(data_juicer.format.formatter.RemoteFormatter method)
(data_juicer.format.LocalFormatter method)
(data_juicer.format.mixture_formatter.MixtureFormatter method)
(data_juicer.format.MixtureFormatter method)
(data_juicer.format.RayEmptyFormatter method)
(data_juicer.format.RemoteFormatter method)
(data_juicer.format.text_formatter.TextFormatter method)
(data_juicer.format.TextFormatter method)
load_formatter() (in module data_juicer.format)
(in module data_juicer.format.formatter)
(in module data_juicer.format.load)
load_from_disk() (data_juicer.core.data.NestedDataset static method)
(data_juicer.core.NestedDataset static method)
load_image() (in module data_juicer.utils.mm_utils)
load_image_byte() (in module data_juicer.utils.mm_utils)
load_images() (in module data_juicer.utils.mm_utils)
load_images_byte() (in module data_juicer.utils.mm_utils)
load_ops() (in module data_juicer.ops)
(in module data_juicer.ops.load)
load_ops_with_stats_meta() (in module data_juicer.config.config)
load_video() (in module data_juicer.utils.mm_utils)
load_videos() (in module data_juicer.utils.mm_utils)
load_words_asset() (in module data_juicer.utils.asset_utils)
loaded_audios (data_juicer.utils.constant.InterVars attribute)
loaded_images (data_juicer.utils.constant.InterVars attribute)
loaded_videos (data_juicer.utils.constant.InterVars attribute)
LocalFormatter (class in data_juicer.format)
(class in data_juicer.format.formatter)
Lz4Compressor (class in data_juicer.utils.compress)
M
main_entities (data_juicer.utils.constant.Fields attribute)
map() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.data.NestedDatasetDict method)
(data_juicer.core.NestedDataset method)
Mapper (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
MAX_BATCH_SIZE (data_juicer.core.Adapter attribute)
(data_juicer.core.adapter.Adapter attribute)
max_line_length (data_juicer.utils.constant.StatsKeysConstant attribute)
MaximumLineLengthFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.maximum_line_length_filter)
Measure (class in data_juicer.analysis.measure)
measure() (data_juicer.analysis.measure.CrossEntropyMeasure method)
(data_juicer.analysis.measure.EntropyMeasure method)
(data_juicer.analysis.measure.JSDivMeasure method)
(data_juicer.analysis.measure.KLDivMeasure method)
(data_juicer.analysis.measure.Measure method)
(data_juicer.analysis.measure.RelatedTTestMeasure method)
merge_config() (in module data_juicer.config)
(in module data_juicer.config.config)
merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
meta (data_juicer.utils.constant.Fields attribute)
meta_name (data_juicer.utils.constant.JobRequiredKeys attribute)
MiB (data_juicer.core.Exporter attribute)
(data_juicer.core.exporter.Exporter attribute)
minhash (data_juicer.utils.constant.HashKeys attribute)
MixtureFormatter (class in data_juicer.format)
(class in data_juicer.format.mixture_formatter)
module
data_juicer
data_juicer.analysis
data_juicer.analysis.collector
data_juicer.analysis.column_wise_analysis
data_juicer.analysis.diversity_analysis
data_juicer.analysis.draw
data_juicer.analysis.measure
data_juicer.analysis.overall_analysis
data_juicer.config
data_juicer.config.config
data_juicer.core
data_juicer.core.adapter
data_juicer.core.analyzer
data_juicer.core.data
data_juicer.core.executor
data_juicer.core.exporter
data_juicer.core.monitor
data_juicer.core.ray_data
data_juicer.core.ray_executor
data_juicer.core.tracer
data_juicer.format
data_juicer.format.csv_formatter
data_juicer.format.empty_formatter
data_juicer.format.formatter
data_juicer.format.json_formatter
data_juicer.format.load
data_juicer.format.mixture_formatter
data_juicer.format.parquet_formatter
data_juicer.format.text_formatter
data_juicer.format.tsv_formatter
data_juicer.ops
data_juicer.ops.aggregator
data_juicer.ops.aggregator.entity_attribute_aggregator
data_juicer.ops.aggregator.most_relavant_entities_aggregator
data_juicer.ops.aggregator.nested_aggregator
data_juicer.ops.base_op
data_juicer.ops.common
data_juicer.ops.common.helper_func
data_juicer.ops.common.special_characters
data_juicer.ops.deduplicator
data_juicer.ops.deduplicator.document_deduplicator
data_juicer.ops.deduplicator.document_minhash_deduplicator
data_juicer.ops.deduplicator.document_simhash_deduplicator
data_juicer.ops.deduplicator.image_deduplicator
data_juicer.ops.deduplicator.ray_basic_deduplicator
data_juicer.ops.deduplicator.ray_document_deduplicator
data_juicer.ops.deduplicator.ray_image_deduplicator
data_juicer.ops.deduplicator.ray_video_deduplicator
data_juicer.ops.deduplicator.video_deduplicator
data_juicer.ops.filter
data_juicer.ops.filter.alphanumeric_filter
data_juicer.ops.filter.audio_duration_filter
data_juicer.ops.filter.audio_nmf_snr_filter
data_juicer.ops.filter.audio_size_filter
data_juicer.ops.filter.average_line_length_filter
data_juicer.ops.filter.character_repetition_filter
data_juicer.ops.filter.flagged_words_filter
data_juicer.ops.filter.image_aesthetics_filter
data_juicer.ops.filter.image_aspect_ratio_filter
data_juicer.ops.filter.image_face_count_filter
data_juicer.ops.filter.image_face_ratio_filter
data_juicer.ops.filter.image_nsfw_filter
data_juicer.ops.filter.image_pair_similarity_filter
data_juicer.ops.filter.image_shape_filter
data_juicer.ops.filter.image_size_filter
data_juicer.ops.filter.image_text_matching_filter
data_juicer.ops.filter.image_text_similarity_filter
data_juicer.ops.filter.image_watermark_filter
data_juicer.ops.filter.language_id_score_filter
data_juicer.ops.filter.maximum_line_length_filter
data_juicer.ops.filter.perplexity_filter
data_juicer.ops.filter.phrase_grounding_recall_filter
data_juicer.ops.filter.special_characters_filter
data_juicer.ops.filter.specified_field_filter
data_juicer.ops.filter.specified_numeric_field_filter
data_juicer.ops.filter.stopwords_filter
data_juicer.ops.filter.suffix_filter
data_juicer.ops.filter.text_action_filter
data_juicer.ops.filter.text_entity_dependency_filter
data_juicer.ops.filter.text_length_filter
data_juicer.ops.filter.token_num_filter
data_juicer.ops.filter.video_aesthetics_filter
data_juicer.ops.filter.video_aspect_ratio_filter
data_juicer.ops.filter.video_duration_filter
data_juicer.ops.filter.video_frames_text_similarity_filter
data_juicer.ops.filter.video_motion_score_filter
data_juicer.ops.filter.video_motion_score_raft_filter
data_juicer.ops.filter.video_nsfw_filter
data_juicer.ops.filter.video_ocr_area_ratio_filter
data_juicer.ops.filter.video_resolution_filter
data_juicer.ops.filter.video_tagging_from_frames_filter
data_juicer.ops.filter.video_watermark_filter
data_juicer.ops.filter.word_repetition_filter
data_juicer.ops.filter.words_num_filter
data_juicer.ops.grouper
data_juicer.ops.grouper.key_value_grouper
data_juicer.ops.grouper.naive_grouper
data_juicer.ops.load
data_juicer.ops.mapper
data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper
data_juicer.ops.mapper.calibrate_qa_mapper
data_juicer.ops.mapper.calibrate_query_mapper
data_juicer.ops.mapper.calibrate_response_mapper
data_juicer.ops.mapper.chinese_convert_mapper
data_juicer.ops.mapper.clean_copyright_mapper
data_juicer.ops.mapper.clean_email_mapper
data_juicer.ops.mapper.clean_html_mapper
data_juicer.ops.mapper.clean_ip_mapper
data_juicer.ops.mapper.clean_links_mapper
data_juicer.ops.mapper.expand_macro_mapper
data_juicer.ops.mapper.extract_entity_attribute_mapper
data_juicer.ops.mapper.extract_entity_relation_mapper
data_juicer.ops.mapper.extract_event_mapper
data_juicer.ops.mapper.extract_keyword_mapper
data_juicer.ops.mapper.extract_nickname_mapper
data_juicer.ops.mapper.extract_support_text_mapper
data_juicer.ops.mapper.fix_unicode_mapper
data_juicer.ops.mapper.generate_qa_from_examples_mapper
data_juicer.ops.mapper.generate_qa_from_text_mapper
data_juicer.ops.mapper.image_blur_mapper
data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
data_juicer.ops.mapper.image_captioning_mapper
data_juicer.ops.mapper.image_diffusion_mapper
data_juicer.ops.mapper.image_face_blur_mapper
data_juicer.ops.mapper.image_tagging_mapper
data_juicer.ops.mapper.nlpaug_en_mapper
data_juicer.ops.mapper.nlpcda_zh_mapper
data_juicer.ops.mapper.optimize_qa_mapper
data_juicer.ops.mapper.optimize_query_mapper
data_juicer.ops.mapper.optimize_response_mapper
data_juicer.ops.mapper.pair_preference_mapper
data_juicer.ops.mapper.punctuation_normalization_mapper
data_juicer.ops.mapper.python_file_mapper
data_juicer.ops.mapper.python_lambda_mapper
data_juicer.ops.mapper.relation_identity_mapper
data_juicer.ops.mapper.remove_bibliography_mapper
data_juicer.ops.mapper.remove_comments_mapper
data_juicer.ops.mapper.remove_header_mapper
data_juicer.ops.mapper.remove_long_words_mapper
data_juicer.ops.mapper.remove_non_chinese_character_mapper
data_juicer.ops.mapper.remove_repeat_sentences_mapper
data_juicer.ops.mapper.remove_specific_chars_mapper
data_juicer.ops.mapper.remove_table_text_mapper
data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper
data_juicer.ops.mapper.replace_content_mapper
data_juicer.ops.mapper.sentence_split_mapper
data_juicer.ops.mapper.text_chunk_mapper
data_juicer.ops.mapper.video_captioning_from_audio_mapper
data_juicer.ops.mapper.video_captioning_from_frames_mapper
data_juicer.ops.mapper.video_captioning_from_summarizer_mapper
data_juicer.ops.mapper.video_captioning_from_video_mapper
data_juicer.ops.mapper.video_extract_frames_mapper
data_juicer.ops.mapper.video_face_blur_mapper
data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper
data_juicer.ops.mapper.video_remove_watermark_mapper
data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
data_juicer.ops.mapper.video_resize_resolution_mapper
data_juicer.ops.mapper.video_split_by_duration_mapper
data_juicer.ops.mapper.video_split_by_key_frame_mapper
data_juicer.ops.mapper.video_split_by_scene_mapper
data_juicer.ops.mapper.video_tagging_from_audio_mapper
data_juicer.ops.mapper.video_tagging_from_frames_mapper
data_juicer.ops.mapper.whitespace_normalization_mapper
data_juicer.ops.op_fusion
data_juicer.ops.selector
data_juicer.ops.selector.frequency_specified_field_selector
data_juicer.ops.selector.random_selector
data_juicer.ops.selector.range_specified_field_selector
data_juicer.ops.selector.topk_specified_field_selector
data_juicer.tools
data_juicer.utils
data_juicer.utils.asset_utils
data_juicer.utils.auto_install_mapping
data_juicer.utils.auto_install_utils
data_juicer.utils.availability_utils
data_juicer.utils.cache_utils
data_juicer.utils.ckpt_utils
data_juicer.utils.common_utils
data_juicer.utils.compress
data_juicer.utils.constant
data_juicer.utils.file_utils
data_juicer.utils.fingerprint_utils
data_juicer.utils.lazy_loader
data_juicer.utils.logger_utils
data_juicer.utils.mm_utils
data_juicer.utils.model_utils
data_juicer.utils.process_utils
data_juicer.utils.registry
data_juicer.utils.resource_utils
data_juicer.utils.unittest_utils
modules (data_juicer.utils.registry.Registry property)
Monitor (class in data_juicer.core)
(class in data_juicer.core.monitor)
monitor_all_resources() (data_juicer.core.Monitor method)
(data_juicer.core.monitor.Monitor method)
monitor_current_resources() (data_juicer.core.Monitor static method)
(data_juicer.core.monitor.Monitor static method)
monitor_func() (data_juicer.core.Monitor static method)
(data_juicer.core.monitor.Monitor static method)
MostRelavantEntitiesAggregator (class in data_juicer.ops.aggregator)
(class in data_juicer.ops.aggregator.most_relavant_entities_aggregator)
multimodal_data_output_dir (data_juicer.utils.constant.Fields attribute)
N
NaiveGrouper (class in data_juicer.ops.grouper)
(class in data_juicer.ops.grouper.naive_grouper)
name (data_juicer.analysis.measure.CrossEntropyMeasure attribute)
(data_juicer.analysis.measure.EntropyMeasure attribute)
(data_juicer.analysis.measure.JSDivMeasure attribute)
(data_juicer.analysis.measure.KLDivMeasure attribute)
(data_juicer.analysis.measure.Measure attribute)
(data_juicer.analysis.measure.RelatedTTestMeasure attribute)
(data_juicer.utils.registry.Registry property)
namespace_to_arg_list() (in module data_juicer.config.config)
nested_access() (in module data_juicer.utils.common_utils)
nested_obj_factory() (in module data_juicer.core.data)
nested_query() (in module data_juicer.core.data)
nested_set() (in module data_juicer.utils.common_utils)
NestedAggregator (class in data_juicer.ops.aggregator)
(class in data_juicer.ops.aggregator.nested_aggregator)
NestedDataset (class in data_juicer.core)
(class in data_juicer.core.data)
NestedDatasetDict (class in data_juicer.core.data)
NestedQueryDict (class in data_juicer.core.data)
nickname (data_juicer.utils.constant.Fields attribute)
NlpaugEnMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.nlpaug_en_mapper)
NlpcdaZhMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.nlpcda_zh_mapper)
null_value (data_juicer.format.empty_formatter.EmptyFormatter property)
(data_juicer.format.empty_formatter.RayEmptyFormatter property)
(data_juicer.format.EmptyFormatter property)
(data_juicer.format.RayEmptyFormatter property)
num_action (data_juicer.utils.constant.StatsKeysConstant attribute)
num_dependency_edges (data_juicer.utils.constant.StatsKeysConstant attribute)
num_token (data_juicer.utils.constant.StatsKeysConstant attribute)
num_words (data_juicer.utils.constant.StatsKeysConstant attribute)
O
OP (class in data_juicer.ops.base_op)
optimal_param() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)
OptimizeQAMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.optimize_qa_mapper)
OptimizeQueryMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.optimize_query_mapper)
OptimizeResponseMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.optimize_response_mapper)
OverallAnalysis (class in data_juicer.analysis)
(class in data_juicer.analysis.overall_analysis)
P
PairPreferenceMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.pair_preference_mapper)
ParquetFormatter (class in data_juicer.format)
(class in data_juicer.format.parquet_formatter)
parse_output() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.calibrate_query_mapper.CalibrateQueryMapper method)
(data_juicer.ops.mapper.calibrate_response_mapper.CalibrateResponseMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.CalibrateQueryMapper method)
(data_juicer.ops.mapper.CalibrateResponseMapper method)
(data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.optimize_query_mapper.OptimizeQueryMapper method)
(data_juicer.ops.mapper.optimize_response_mapper.OptimizeResponseMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.OptimizeQueryMapper method)
(data_juicer.ops.mapper.OptimizeResponseMapper method)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
parse_string_to_roi() (in module data_juicer.utils.mm_utils)
perplexity (data_juicer.utils.constant.StatsKeysConstant attribute)
PerplexityFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.perplexity_filter)
phrase_grounding_recall (data_juicer.utils.constant.StatsKeysConstant attribute)
PhraseGroundingRecallFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.phrase_grounding_recall_filter)
pil_to_opencv() (in module data_juicer.utils.mm_utils)
prepare_api_model() (in module data_juicer.utils.model_utils)
prepare_converter() (in module data_juicer.ops.mapper.chinese_convert_mapper)
prepare_diffusion_model() (in module data_juicer.utils.model_utils)
prepare_fasttext_model() (in module data_juicer.utils.model_utils)
prepare_huggingface_model() (in module data_juicer.utils.model_utils)
prepare_kenlm_model() (in module data_juicer.utils.model_utils)
prepare_model() (in module data_juicer.utils.model_utils)
prepare_nltk_model() (in module data_juicer.utils.model_utils)
prepare_opencv_classifier() (in module data_juicer.utils.model_utils)
prepare_recognizeAnything_model() (in module data_juicer.utils.model_utils)
prepare_sentencepiece_for_lang() (in module data_juicer.utils.model_utils)
prepare_sentencepiece_model() (in module data_juicer.utils.model_utils)
prepare_side_configs() (in module data_juicer.config)
(in module data_juicer.config.config)
prepare_simple_aesthetics_model() (in module data_juicer.utils.model_utils)
prepare_spacy_model() (in module data_juicer.utils.model_utils)
prepare_video_blip_model() (in module data_juicer.utils.model_utils)
prepare_vllm_model() (in module data_juicer.utils.model_utils)
preprocess_dataset() (in module data_juicer.core.ray_data)
probe_small_batch() (data_juicer.core.Adapter method)
(data_juicer.core.adapter.Adapter method)
process() (data_juicer.core.data.DJDataset method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
(data_juicer.core.ray_data.RayDataset method)
(data_juicer.ops.base_op.Deduplicator method)
(data_juicer.ops.base_op.Grouper method)
(data_juicer.ops.base_op.OP method)
(data_juicer.ops.base_op.Selector method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.grouper.key_value_grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.naive_grouper.NaiveGrouper method)
(data_juicer.ops.grouper.NaiveGrouper method)
(data_juicer.ops.Selector method)
(data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.random_selector.RandomSelector method)
(data_juicer.ops.selector.RandomSelector method)
(data_juicer.ops.selector.range_specified_field_selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector method)
(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
process_batched() (data_juicer.ops.base_op.Filter method)
(data_juicer.ops.base_op.Mapper method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.perplexity_filter.PerplexityFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.text_length_filter.TextLengthFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.words_num_filter.WordsNumFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.chinese_convert_mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper method)
(data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.CleanIpMapper method)
(data_juicer.ops.mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.image_captioning_mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.image_diffusion_mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method)
(data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.remove_non_chinese_character_mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.remove_repeat_sentences_mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.replace_content_mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
(data_juicer.ops.mapper.video_captioning_from_audio_mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.video_captioning_from_frames_mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.video_captioning_from_summarizer_mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.video_captioning_from_video_mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper method)
(data_juicer.ops.mapper.WhitespaceNormalizationMapper method)
(data_juicer.ops.op_fusion.FusedFilter method)
process_each_frame() (in module data_juicer.utils.mm_utils)
process_single() (data_juicer.ops.Aggregator method)
(data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.base_op.Aggregator method)
(data_juicer.ops.base_op.Filter method)
(data_juicer.ops.base_op.Mapper method)
(data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method)
(data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method)
(data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method)
(data_juicer.ops.filter.image_size_filter.ImageSizeFilter method)
(data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.stopwords_filter.StopWordsFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.suffix_filter.SuffixFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.text_action_filter.TextActionFilter method)
(data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.token_num_filter.TokenNumFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method)
(data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method)
(data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method)
(data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.image_blur_mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.image_face_blur_mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.image_tagging_mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method)
(data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.video_extract_frames_mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.video_face_blur_mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.video_remove_watermark_mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.video_resize_resolution_mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.video_tagging_from_audio_mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.video_tagging_from_frames_mapper.VideoTaggingFromFramesMapper method)
(data_juicer.ops.mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.VideoTaggingFromFramesMapper method)
PunctuationNormalizationMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.punctuation_normalization_mapper)
PythonFileMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.python_file_mapper)
PythonLambdaMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.python_lambda_mapper)
Q
query_cuda_info() (in module data_juicer.utils.resource_utils)
query_mem_info() (in module data_juicer.utils.resource_utils)
query_most_relavant_entities() (data_juicer.ops.aggregator.most_relavant_entities_aggregator.MostRelavantEntitiesAggregator method)
(data_juicer.ops.aggregator.MostRelavantEntitiesAggregator method)
R
random_sample() (data_juicer.format.mixture_formatter.MixtureFormatter class method)
(data_juicer.format.MixtureFormatter class method)
RandomSelector (class in data_juicer.ops.selector)
(class in data_juicer.ops.selector.random_selector)
RangeSpecifiedFieldSelector (class in data_juicer.ops.selector)
(class in data_juicer.ops.selector.range_specified_field_selector)
RayBasicDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.ray_basic_deduplicator)
RayDataset (class in data_juicer.core.ray_data)
RayDocumentDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.ray_document_deduplicator)
RayEmptyFormatter (class in data_juicer.format)
(class in data_juicer.format.empty_formatter)
RayExecutor (class in data_juicer.core.ray_executor)
RayImageDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.ray_image_deduplicator)
RayVideoDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.ray_video_deduplicator)
read_json() (data_juicer.core.ray_data.RayDataset class method)
read_json_stream() (in module data_juicer.core.ray_data)
record() (data_juicer.utils.ckpt_utils.CheckpointManager method)
recursive_summary() (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
recursively_chunk() (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
redirect_sys_output() (in module data_juicer.utils.logger_utils)
refine_single_column() (data_juicer.analysis.overall_analysis.OverallAnalysis method)
(data_juicer.analysis.OverallAnalysis method)
refined_words (data_juicer.utils.constant.InterVars attribute)
register_module() (data_juicer.utils.registry.Registry method)
Registry (class in data_juicer.utils.registry)
RelatedTTestMeasure (class in data_juicer.analysis.measure)
relation (data_juicer.utils.constant.Fields attribute)
relation_description (data_juicer.utils.constant.Fields attribute)
relation_keywords (data_juicer.utils.constant.Fields attribute)
relation_strength (data_juicer.utils.constant.Fields attribute)
RelationIdentityMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.relation_identity_mapper)
relevant_characters (data_juicer.utils.constant.Fields attribute)
RemoteFormatter (class in data_juicer.format)
(class in data_juicer.format.formatter)
remove_columns() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
remove_extra_parameters() (data_juicer.ops.base_op.OP method)
remove_non_special_tokens() (in module data_juicer.utils.mm_utils)
remove_punctuation() (in module data_juicer.ops.filter.phrase_grounding_recall_filter)
remove_special_tokens() (in module data_juicer.utils.mm_utils)
RemoveBibliographyMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_bibliography_mapper)
RemoveCommentsMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_comments_mapper)
RemoveHeaderMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_header_mapper)
RemoveLongWordsMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_long_words_mapper)
RemoveNonChineseCharacterlMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_non_chinese_character_mapper)
RemoveRepeatSentencesMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_repeat_sentences_mapper)
RemoveSpecificCharsMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_specific_chars_mapper)
RemoveTableTextMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_table_text_mapper)
RemoveWordsWithIncorrectSubstringsMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper)
replace_func() (in module data_juicer.ops.mapper.video_split_by_scene_mapper)
ReplaceContentMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.replace_content_mapper)
rescale() (in module data_juicer.ops.mapper.video_resize_aspect_ratio_mapper)
resource_monitor() (in module data_juicer.core.monitor)
run() (data_juicer.core.Analyzer method)
(data_juicer.core.analyzer.Analyzer method)
(data_juicer.core.Executor method)
(data_juicer.core.executor.Executor method)
(data_juicer.core.ray_executor.RayExecutor method)
(data_juicer.ops.Aggregator method)
(data_juicer.ops.base_op.Aggregator method)
(data_juicer.ops.base_op.Deduplicator method)
(data_juicer.ops.base_op.Filter method)
(data_juicer.ops.base_op.Grouper method)
(data_juicer.ops.base_op.Mapper method)
(data_juicer.ops.base_op.OP method)
(data_juicer.ops.base_op.Selector method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.Selector method)
run_ner() (in module data_juicer.ops.filter.phrase_grounding_recall_filter)
run_single_op() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method)
runtime_np() (data_juicer.ops.base_op.OP method)
S
sample_data() (data_juicer.core.Executor method)
(data_juicer.core.executor.Executor method)
sampled_frames (data_juicer.utils.constant.InterVars attribute)
save_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method)
select() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
select_columns() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
Selector (class in data_juicer.ops)
(class in data_juicer.ops.base_op)
SentenceSplitMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.sentence_split_mapper)
separate_signal_noise() (in module data_juicer.ops.filter.audio_nmf_snr_filter)
set_clear_model_flag() (in module data_juicer.utils.unittest_utils)
set_dataset_to_absolute_path() (in module data_juicer.core.ray_data)
setup_logger() (in module data_juicer.utils.logger_utils)
setup_model() (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
setup_mp() (in module data_juicer.utils.process_utils)
setUpClass() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method)
sha1_hash32() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator)
should_keep_long_word() (data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.RemoveLongWordsMapper method)
should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
simhash (data_juicer.utils.constant.HashKeys attribute)
size_to_bytes() (in module data_juicer.utils.mm_utils)
sort_op_by_types_and_names() (in module data_juicer.config.config)
source_entity (data_juicer.utils.constant.Fields attribute)
source_file (data_juicer.utils.constant.Fields attribute)
special_char_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
SpecialCharactersFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.special_characters_filter)
SpecialTokens (class in data_juicer.utils.mm_utils)
SpecifiedFieldFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.specified_field_filter)
SpecifiedNumericFieldFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.specified_numeric_field_filter)
split_on_newline_tab_whitespace() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
split_on_whitespace() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
split_sentence() (in module data_juicer.ops.mapper.remove_repeat_sentences_mapper)
split_text_by_punctuation() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
split_videos_by_duration() (data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.VideoSplitByDurationMapper method)
stats (data_juicer.utils.constant.Fields attribute)
stats_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method)
stats_to_number() (in module data_juicer.utils.common_utils)
StatsKeys (class in data_juicer.utils.constant)
StatsKeysConstant (class in data_juicer.utils.constant)
StatsKeysMeta (class in data_juicer.utils.constant)
stopwords_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
StopWordsFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.stopwords_filter)
STRATEGY (data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper attribute)
(data_juicer.ops.mapper.VideoResizeAspectRatioMapper attribute)
StreamToLoguru (class in data_juicer.utils.logger_utils)
strip() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
suffix (data_juicer.utils.constant.Fields attribute)
SUFFIXES (data_juicer.format.csv_formatter.CsvFormatter attribute)
(data_juicer.format.CsvFormatter attribute)
(data_juicer.format.empty_formatter.EmptyFormatter attribute)
(data_juicer.format.empty_formatter.RayEmptyFormatter attribute)
(data_juicer.format.EmptyFormatter attribute)
(data_juicer.format.json_formatter.JsonFormatter attribute)
(data_juicer.format.JsonFormatter attribute)
(data_juicer.format.parquet_formatter.ParquetFormatter attribute)
(data_juicer.format.ParquetFormatter attribute)
(data_juicer.format.RayEmptyFormatter attribute)
(data_juicer.format.text_formatter.TextFormatter attribute)
(data_juicer.format.TextFormatter attribute)
(data_juicer.format.tsv_formatter.TsvFormatter attribute)
(data_juicer.format.TsvFormatter attribute)
SuffixFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.suffix_filter)
support_text (data_juicer.utils.constant.Fields attribute)
T
take_batch() (data_juicer.core.Adapter static method)
(data_juicer.core.adapter.Adapter static method)
target_entity (data_juicer.utils.constant.Fields attribute)
tearDown() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method)
tearDownClass() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method)
TEST_TAG() (in module data_juicer.utils.unittest_utils)
text_len (data_juicer.utils.constant.StatsKeysConstant attribute)
TextActionFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.text_action_filter)
TextChunkMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.text_chunk_mapper)
TextEntityDependencyFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.text_entity_dependency_filter)
TextFormatter (class in data_juicer.format)
(class in data_juicer.format.text_formatter)
TextLengthFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.text_length_filter)
TextTokenDistCollector (class in data_juicer.analysis.collector)
TiB (data_juicer.core.Exporter attribute)
(data_juicer.core.exporter.Exporter attribute)
timecode_string_to_seconds() (in module data_juicer.utils.mm_utils)
to_json() (data_juicer.core.Exporter static method)
(data_juicer.core.exporter.Exporter static method)
to_jsonl() (data_juicer.core.Exporter static method)
(data_juicer.core.exporter.Exporter static method)
to_parquet() (data_juicer.core.Exporter static method)
(data_juicer.core.exporter.Exporter static method)
TokenNumFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.token_num_filter)
TopkSpecifiedFieldSelector (class in data_juicer.ops.selector)
(class in data_juicer.ops.selector.topk_specified_field_selector)
trace_batch_mapper() (data_juicer.core.Tracer method)
(data_juicer.core.tracer.Tracer method)
trace_deduplicator() (data_juicer.core.Tracer method)
(data_juicer.core.tracer.Tracer method)
trace_filter() (data_juicer.core.Tracer method)
(data_juicer.core.tracer.Tracer method)
trace_mapper() (data_juicer.core.Tracer method)
(data_juicer.core.tracer.Tracer method)
Tracer (class in data_juicer.core)
(class in data_juicer.core.tracer)
transfer_filename() (in module data_juicer.utils.file_utils)
triangle_area() (in module data_juicer.ops.filter.video_ocr_area_ratio_filter)
TsvFormatter (class in data_juicer.format)
(class in data_juicer.format.tsv_formatter)
U
unify_format() (in module data_juicer.format.formatter)
union() (data_juicer.ops.common.helper_func.UnionFind method)
UnionFind (class in data_juicer.ops.common.helper_func)
update() (data_juicer.utils.fingerprint_utils.Hasher method)
update_args() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
update_ds_cache_dir_and_related_vars() (in module data_juicer.config.config)
update_fingerprint() (in module data_juicer.utils.fingerprint_utils)
update_op_attr() (in module data_juicer.config.config)
update_op_process() (in module data_juicer.config.config)
use_cuda() (data_juicer.ops.base_op.OP method)
V
video (data_juicer.utils.mm_utils.SpecialTokens attribute)
video_aesthetic_score (data_juicer.utils.constant.StatsKeysConstant attribute)
video_aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute)
video_audio_tags (data_juicer.utils.constant.Fields attribute)
video_duration (data_juicer.utils.constant.StatsKeysConstant attribute)
video_frame_tags (data_juicer.utils.constant.Fields attribute)
video_frames (data_juicer.utils.constant.Fields attribute)
video_frames_aesthetics_score (data_juicer.utils.constant.StatsKeysConstant attribute)
video_frames_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute)
video_height (data_juicer.utils.constant.StatsKeysConstant attribute)
video_motion_score (data_juicer.utils.constant.StatsKeysConstant attribute)
video_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute)
video_ocr_area_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
video_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute)
video_width (data_juicer.utils.constant.StatsKeysConstant attribute)
VideoAestheticsFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_aesthetics_filter)
VideoAspectRatioFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_aspect_ratio_filter)
VideoCaptioningFromAudioMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_captioning_from_audio_mapper)
VideoCaptioningFromFramesMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_captioning_from_frames_mapper)
VideoCaptioningFromSummarizerMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_captioning_from_summarizer_mapper)
VideoCaptioningFromVideoMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_captioning_from_video_mapper)
VideoCapture() (in module data_juicer.ops.filter.video_motion_score_filter)
VideoDeduplicator (class in data_juicer.ops.deduplicator)
(class in data_juicer.ops.deduplicator.video_deduplicator)
VideoDurationFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_duration_filter)
VideoExtractFramesMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_extract_frames_mapper)
VideoFaceBlurMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_face_blur_mapper)
VideoFFmpegWrappedMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper)
VideoFramesTextSimilarityFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_frames_text_similarity_filter)
videohash (data_juicer.utils.constant.HashKeys attribute)
VideoMotionScoreFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_motion_score_filter)
VideoMotionScoreRaftFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_motion_score_raft_filter)
VideoNSFWFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_nsfw_filter)
VideoOcrAreaRatioFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_ocr_area_ratio_filter)
VideoRemoveWatermarkMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_remove_watermark_mapper)
VideoResizeAspectRatioMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_resize_aspect_ratio_mapper)
VideoResizeResolutionMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_resize_resolution_mapper)
VideoResolutionFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_resolution_filter)
VideoSplitByDurationMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_split_by_duration_mapper)
VideoSplitByKeyFrameMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_split_by_key_frame_mapper)
VideoSplitBySceneMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_split_by_scene_mapper)
VideoTaggingFromAudioMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_tagging_from_audio_mapper)
VideoTaggingFromFramesFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_tagging_from_frames_filter)
VideoTaggingFromFramesMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.video_tagging_from_frames_mapper)
VideoWatermarkFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.video_watermark_filter)
W
WhitespaceNormalizationMapper (class in data_juicer.ops.mapper)
(class in data_juicer.ops.mapper.whitespace_normalization_mapper)
word_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute)
WordRepetitionFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.word_repetition_filter)
words (data_juicer.utils.constant.InterVars attribute)
words_augmentation() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
words_refinement() (in module data_juicer.ops.common)
(in module data_juicer.ops.common.helper_func)
WordsNumFilter (class in data_juicer.ops.filter)
(class in data_juicer.ops.filter.words_num_filter)
wrap_func_with_nested_access() (in module data_juicer.core.data)
write() (data_juicer.utils.logger_utils.StreamToLoguru method)
Z
ZstdCompressor (class in data_juicer.utils.compress)