Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z _ __init__() (data_juicer.analysis.collector.TextTokenDistCollector method) (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) (data_juicer.core.Analyzer method) (data_juicer.core.analyzer.Analyzer method) (data_juicer.core.data.data_validator.BaseConversationValidator method) (data_juicer.core.data.data_validator.CodeDataValidator method) (data_juicer.core.data.data_validator.DataValidator method) (data_juicer.core.data.data_validator.RequiredFieldsValidator method) (data_juicer.core.data.dataset_builder.DatasetBuilder method) (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.dj_dataset.NestedDatasetDict method) (data_juicer.core.data.dj_dataset.NestedQueryDict method) (data_juicer.core.data.load_strategy.DataLoadStrategy method) (data_juicer.core.data.load_strategy.StrategyKey method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.data.ray_dataset.RayDataset method) (data_juicer.core.data.schema.Schema method) (data_juicer.core.DefaultExecutor method) (data_juicer.core.executor.base.ExecutorBase method) (data_juicer.core.executor.default_executor.DefaultExecutor method) (data_juicer.core.executor.DefaultExecutor method) (data_juicer.core.executor.ExecutorBase method) (data_juicer.core.executor.ray_executor.RayExecutor method) (data_juicer.core.executor.ray_executor.TempDirManager method) (data_juicer.core.ExecutorBase method) (data_juicer.core.Exporter method) (data_juicer.core.exporter.Exporter method) (data_juicer.core.Monitor method) (data_juicer.core.monitor.Monitor method) (data_juicer.core.NestedDataset method) (data_juicer.core.Tracer method) (data_juicer.core.tracer.Tracer method) (data_juicer.download.downloader.DocumentDownloader method) (data_juicer.download.downloader.DocumentExtractor method) (data_juicer.download.downloader.DocumentIterator method) (data_juicer.download.wikipedia.WikipediaDownloader method) (data_juicer.download.wikipedia.WikipediaExtractor method) (data_juicer.download.wikipedia.WikipediaIterator method) (data_juicer.format.csv_formatter.CsvFormatter method) (data_juicer.format.CsvFormatter method) (data_juicer.format.empty_formatter.EmptyFormatter method) (data_juicer.format.empty_formatter.RayEmptyFormatter method) (data_juicer.format.EmptyFormatter method) (data_juicer.format.formatter.LocalFormatter method) (data_juicer.format.formatter.RemoteFormatter method) (data_juicer.format.json_formatter.JsonFormatter method) (data_juicer.format.JsonFormatter method) (data_juicer.format.LocalFormatter method) (data_juicer.format.parquet_formatter.ParquetFormatter method) (data_juicer.format.ParquetFormatter method) (data_juicer.format.RayEmptyFormatter method) (data_juicer.format.RemoteFormatter method) (data_juicer.format.text_formatter.TextFormatter method) (data_juicer.format.TextFormatter method) (data_juicer.format.tsv_formatter.TsvFormatter method) (data_juicer.format.TsvFormatter method) (data_juicer.ops.Aggregator method) (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionRefine method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReplace method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReweight method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) (data_juicer.ops.common.prompt2prompt_pipeline.LocalBlend method) (data_juicer.ops.common.prompt2prompt_pipeline.P2PCrossAttnProcessor method) (data_juicer.ops.common.prompt2prompt_pipeline.ScoreParams method) (data_juicer.ops.Deduplicator method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.ActorBackend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.Backend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.DedupSet method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RedisBackend method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.IdGenerator method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) (data_juicer.ops.Filter method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.filter.AlphanumericFilter method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) (data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method) (data_juicer.ops.filter.AudioDurationFilter method) (data_juicer.ops.filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.AudioSizeFilter method) (data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method) (data_juicer.ops.filter.AverageLineLengthFilter method) (data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method) (data_juicer.ops.filter.FlaggedWordFilter method) (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) (data_juicer.ops.filter.general_field_filter.GeneralFieldFilter method) (data_juicer.ops.filter.GeneralFieldFilter method) (data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method) (data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method) (data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method) (data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method) (data_juicer.ops.filter.image_size_filter.ImageSizeFilter method) (data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method) (data_juicer.ops.filter.ImageAestheticsFilter method) (data_juicer.ops.filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.ImageFaceCountFilter method) (data_juicer.ops.filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.ImageNSFWFilter method) (data_juicer.ops.filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.ImageShapeFilter method) (data_juicer.ops.filter.ImageSizeFilter method) (data_juicer.ops.filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.ImageWatermarkFilter method) (data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.perplexity_filter.PerplexityFilter method) (data_juicer.ops.filter.PerplexityFilter method) (data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method) (data_juicer.ops.filter.SpecialCharactersFilter method) (data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.stopwords_filter.StopWordsFilter method) (data_juicer.ops.filter.StopWordsFilter method) (data_juicer.ops.filter.suffix_filter.SuffixFilter method) (data_juicer.ops.filter.SuffixFilter method) (data_juicer.ops.filter.text_action_filter.TextActionFilter method) (data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.text_length_filter.TextLengthFilter method) (data_juicer.ops.filter.text_pair_similarity_filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.TextActionFilter method) (data_juicer.ops.filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.TextLengthFilter method) (data_juicer.ops.filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.token_num_filter.TokenNumFilter method) (data_juicer.ops.filter.TokenNumFilter method) (data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method) (data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method) (data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method) (data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method) (data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method) (data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method) (data_juicer.ops.filter.VideoAestheticsFilter method) (data_juicer.ops.filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.VideoDurationFilter method) (data_juicer.ops.filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.VideoMotionScoreRaftFilter method) (data_juicer.ops.filter.VideoNSFWFilter method) (data_juicer.ops.filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.VideoResolutionFilter method) (data_juicer.ops.filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.VideoWatermarkFilter method) (data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method) (data_juicer.ops.filter.WordRepetitionFilter method) (data_juicer.ops.filter.words_num_filter.WordsNumFilter method) (data_juicer.ops.filter.WordsNumFilter method) (data_juicer.ops.Grouper method) (data_juicer.ops.grouper.key_value_grouper.KeyValueGrouper method) (data_juicer.ops.grouper.KeyValueGrouper method) (data_juicer.ops.grouper.naive_grouper.NaiveGrouper method) (data_juicer.ops.grouper.naive_reverse_grouper.NaiveReverseGrouper method) (data_juicer.ops.grouper.NaiveGrouper method) (data_juicer.ops.grouper.NaiveReverseGrouper method) (data_juicer.ops.Mapper method) (data_juicer.ops.mapper.annotation.annotation_mapper.BaseAnnotationMapper method) (data_juicer.ops.mapper.annotation.annotation_mapper.LabelStudioAnnotationMapper method) (data_juicer.ops.mapper.annotation.human_preference_annotation_mapper.HumanPreferenceAnnotationMapper method) (data_juicer.ops.mapper.audio_add_gaussian_noise_mapper.AudioAddGaussianNoiseMapper method) (data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper.AudioFFmpegWrappedMapper method) (data_juicer.ops.mapper.AudioAddGaussianNoiseMapper method) (data_juicer.ops.mapper.AudioFFmpegWrappedMapper method) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.chinese_convert_mapper.ChineseConvertMapper method) (data_juicer.ops.mapper.ChineseConvertMapper method) (data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper method) (data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper method) (data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper method) (data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper method) (data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper method) (data_juicer.ops.mapper.CleanCopyrightMapper method) (data_juicer.ops.mapper.CleanEmailMapper method) (data_juicer.ops.mapper.CleanHtmlMapper method) (data_juicer.ops.mapper.CleanIpMapper method) (data_juicer.ops.mapper.CleanLinksMapper method) (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.Difference_Area_Generator_Mapper method) (data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper method) (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper method) (data_juicer.ops.mapper.extract_tables_from_html_mapper.ExtractTablesFromHtmlMapper method) (data_juicer.ops.mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.ExtractEventMapper method) (data_juicer.ops.mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.ExtractSupportTextMapper method) (data_juicer.ops.mapper.ExtractTablesFromHtmlMapper method) (data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper method) (data_juicer.ops.mapper.FixUnicodeMapper method) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.HumanPreferenceAnnotationMapper method) (data_juicer.ops.mapper.image_blur_mapper.ImageBlurMapper method) (data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper.ImageCaptioningFromGPT4VMapper method) (data_juicer.ops.mapper.image_captioning_mapper.ImageCaptioningMapper method) (data_juicer.ops.mapper.image_diffusion_mapper.ImageDiffusionMapper method) (data_juicer.ops.mapper.image_face_blur_mapper.ImageFaceBlurMapper method) (data_juicer.ops.mapper.image_remove_background_mapper.ImageRemoveBackgroundMapper method) (data_juicer.ops.mapper.image_segment_mapper.ImageSegmentMapper method) (data_juicer.ops.mapper.image_tagging_mapper.ImageTaggingMapper method) (data_juicer.ops.mapper.ImageBlurMapper method) (data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method) (data_juicer.ops.mapper.ImageCaptioningMapper method) (data_juicer.ops.mapper.ImageDiffusionMapper method) (data_juicer.ops.mapper.ImageFaceBlurMapper method) (data_juicer.ops.mapper.ImageRemoveBackgroundMapper method) (data_juicer.ops.mapper.ImageSegmentMapper method) (data_juicer.ops.mapper.ImageTaggingMapper method) (data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper.Difference_Area_Generator_Mapper method) (data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper.Difference_Caption_Generator_Mapper method) (data_juicer.ops.mapper.mllm_mapper.MllmMapper method) (data_juicer.ops.mapper.MllmMapper method) (data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper method) (data_juicer.ops.mapper.NlpaugEnMapper method) (data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper method) (data_juicer.ops.mapper.NlpcdaZhMapper method) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper method) (data_juicer.ops.mapper.PunctuationNormalizationMapper method) (data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method) (data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.PythonFileMapper method) (data_juicer.ops.mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.query_intent_detection_mapper.QueryIntentDetectionMapper method) (data_juicer.ops.mapper.query_sentiment_detection_mapper.QuerySentimentDetectionMapper method) (data_juicer.ops.mapper.query_topic_detection_mapper.QueryTopicDetectionMapper method) (data_juicer.ops.mapper.QueryIntentDetectionMapper method) (data_juicer.ops.mapper.QuerySentimentDetectionMapper method) (data_juicer.ops.mapper.QueryTopicDetectionMapper method) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method) (data_juicer.ops.mapper.RelationIdentityMapper method) (data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper method) (data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper method) (data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper method) (data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method) (data_juicer.ops.mapper.remove_non_chinese_character_mapper.RemoveNonChineseCharacterlMapper method) (data_juicer.ops.mapper.remove_repeat_sentences_mapper.RemoveRepeatSentencesMapper method) (data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper method) (data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper method) (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method) (data_juicer.ops.mapper.RemoveBibliographyMapper method) (data_juicer.ops.mapper.RemoveCommentsMapper method) (data_juicer.ops.mapper.RemoveHeaderMapper method) (data_juicer.ops.mapper.RemoveLongWordsMapper method) (data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method) (data_juicer.ops.mapper.RemoveRepeatSentencesMapper method) (data_juicer.ops.mapper.RemoveSpecificCharsMapper method) (data_juicer.ops.mapper.RemoveTableTextMapper method) (data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method) (data_juicer.ops.mapper.replace_content_mapper.ReplaceContentMapper method) (data_juicer.ops.mapper.ReplaceContentMapper method) (data_juicer.ops.mapper.sdxl_prompt2prompt_mapper.SDXLPrompt2PromptMapper method) (data_juicer.ops.mapper.SDXLPrompt2PromptMapper method) (data_juicer.ops.mapper.sentence_augmentation_mapper.SentenceAugmentationMapper method) (data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper method) (data_juicer.ops.mapper.SentenceAugmentationMapper method) (data_juicer.ops.mapper.SentenceSplitMapper method) (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method) (data_juicer.ops.mapper.TextChunkMapper method) (data_juicer.ops.mapper.video_captioning_from_audio_mapper.VideoCaptioningFromAudioMapper method) (data_juicer.ops.mapper.video_captioning_from_frames_mapper.VideoCaptioningFromFramesMapper method) (data_juicer.ops.mapper.video_captioning_from_summarizer_mapper.VideoCaptioningFromSummarizerMapper method) (data_juicer.ops.mapper.video_captioning_from_video_mapper.VideoCaptioningFromVideoMapper method) (data_juicer.ops.mapper.video_extract_frames_mapper.VideoExtractFramesMapper method) (data_juicer.ops.mapper.video_face_blur_mapper.VideoFaceBlurMapper method) (data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper.VideoFFmpegWrappedMapper method) (data_juicer.ops.mapper.video_remove_watermark_mapper.VideoRemoveWatermarkMapper method) (data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper method) (data_juicer.ops.mapper.video_resize_resolution_mapper.VideoResizeResolutionMapper method) (data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method) (data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method) (data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper method) (data_juicer.ops.mapper.video_tagging_from_audio_mapper.VideoTaggingFromAudioMapper method) (data_juicer.ops.mapper.video_tagging_from_frames_mapper.VideoTaggingFromFramesMapper method) (data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method) (data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method) (data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method) (data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method) (data_juicer.ops.mapper.VideoExtractFramesMapper method) (data_juicer.ops.mapper.VideoFaceBlurMapper method) (data_juicer.ops.mapper.VideoFFmpegWrappedMapper method) (data_juicer.ops.mapper.VideoRemoveWatermarkMapper method) (data_juicer.ops.mapper.VideoResizeAspectRatioMapper method) (data_juicer.ops.mapper.VideoResizeResolutionMapper method) (data_juicer.ops.mapper.VideoSplitByDurationMapper method) (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method) (data_juicer.ops.mapper.VideoSplitBySceneMapper method) (data_juicer.ops.mapper.VideoTaggingFromAudioMapper method) (data_juicer.ops.mapper.VideoTaggingFromFramesMapper method) (data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper method) (data_juicer.ops.mapper.WhitespaceNormalizationMapper method) (data_juicer.ops.mixins.EventDrivenMixin method) (data_juicer.ops.mixins.NotificationMixin method) (data_juicer.ops.op_fusion.FusedFilter method) (data_juicer.ops.op_fusion.GeneralFusedOP method) (data_juicer.ops.Selector method) (data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector method) (data_juicer.ops.selector.FrequencySpecifiedFieldSelector method) (data_juicer.ops.selector.random_selector.RandomSelector method) (data_juicer.ops.selector.RandomSelector method) (data_juicer.ops.selector.range_specified_field_selector.RangeSpecifiedFieldSelector method) (data_juicer.ops.selector.RangeSpecifiedFieldSelector method) (data_juicer.ops.selector.tags_specified_field_selector.TagsSpecifiedFieldSelector method) (data_juicer.ops.selector.TagsSpecifiedFieldSelector method) (data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector method) (data_juicer.ops.selector.TopkSpecifiedFieldSelector method) (data_juicer.utils.cache_utils.DatasetCacheControl method) (data_juicer.utils.ckpt_utils.CheckpointManager method) (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (data_juicer.utils.fingerprint_utils.Hasher method) (data_juicer.utils.lazy_loader.LazyLoader method) (data_juicer.utils.logger_utils.StreamToLoguru method) (data_juicer.utils.model_utils.APIModel method) (data_juicer.utils.registry.Registry method) A ActorBackend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) adapt_workloads() (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) Adapter (class in data_juicer.core) (class in data_juicer.core.adapter) add_column() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) add_key_value_pairs() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) add_message() (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.ExtractEntityRelationMapper method) add_parameters() (data_juicer.ops.base_op.OP method) add_same_content_to_new_column() (in module data_juicer.core.data) (in module data_juicer.core.data.dj_dataset) add_suffix_to_filename() (in module data_juicer.utils.file_utils) add_suffixes() (in module data_juicer.format.formatter) Aggregator (class in data_juicer.ops) (class in data_juicer.ops.base_op) alnum_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) alpha_token_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) AlphanumericFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.alphanumeric_filter) analyze() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) analyze_resource_util_list() (data_juicer.core.Monitor static method) (data_juicer.core.monitor.Monitor static method) analyze_single_resource_util() (data_juicer.core.Monitor static method) (data_juicer.core.monitor.Monitor static method) analyze_small_batch() (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) Analyzer (class in data_juicer.core) (class in data_juicer.core.analyzer) APIModel (class in data_juicer.utils.model_utils) aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) assertDatasetEqual() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method) AttentionControl (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionControlEdit (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionRefine (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionReplace (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionReweight (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionStore (class in data_juicer.ops.common.prompt2prompt_pipeline) attribute_descriptions (data_juicer.utils.constant.MetaKeys attribute) attribute_summary() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) attribute_support_texts (data_juicer.utils.constant.MetaKeys attribute) attributes (data_juicer.utils.constant.MetaKeys attribute) audio (data_juicer.utils.mm_utils.SpecialTokens attribute) audio_duration (data_juicer.utils.constant.StatsKeysConstant attribute) audio_nmf_snr (data_juicer.utils.constant.StatsKeysConstant attribute) audio_sizes (data_juicer.utils.constant.StatsKeysConstant attribute) AudioAddGaussianNoiseMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.audio_add_gaussian_noise_mapper) AudioDurationFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.audio_duration_filter) AudioFFmpegWrappedMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper) AudioNMFSNRFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.audio_nmf_snr_filter) AudioSizeFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.audio_size_filter) AV_STREAM_THREAD_TYPE (in module data_juicer.utils.mm_utils) avaliable_detectors (data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper attribute) (data_juicer.ops.mapper.VideoSplitBySceneMapper attribute) AverageLineLengthFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.average_line_length_filter) avg_line_length (data_juicer.utils.constant.StatsKeysConstant attribute) avg_split_string_list_under_limit() (in module data_juicer.utils.common_utils) B Backend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) balanced_union_find() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) BaseAnnotationMapper (class in data_juicer.ops.mapper.annotation.annotation_mapper) BaseCompressor (class in data_juicer.utils.compress) BaseConversationValidator (class in data_juicer.core.data.data_validator) BaseFormatter (class in data_juicer.format.formatter) batch_meta (data_juicer.utils.constant.Fields attribute) batch_size_strategy() (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) BatchMetaKeys (class in data_juicer.utils.constant) bbox_tag (data_juicer.utils.constant.MetaKeys attribute) between_steps() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) BTSUnionFind (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) build_input() (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.LLMQualityScoreFilter method) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.PairPreferenceMapper method) C CacheCompressManager (class in data_juicer.utils.compress) calc_minhash() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) calculate_hash() (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.RayVideoDeduplicator method) calculate_np() (in module data_juicer.utils.process_utils) calculate_resized_dimensions() (in module data_juicer.utils.mm_utils) CalibrateQAMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.calibrate_qa_mapper) CalibrateQueryMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.calibrate_query_mapper) CalibrateResponseMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.calibrate_response_mapper) call_gpt_vision_api() (in module data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper) catch_map_batches_exception() (in module data_juicer.ops.base_op) catch_map_single_exception() (in module data_juicer.ops.base_op) category_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method) char_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) CharacterRepetitionFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.character_repetition_filter) check_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) check_inputs() (data_juicer.ops.common.prompt2prompt_pipeline.Prompt2PromptPipeline method) check_model() (in module data_juicer.utils.model_utils) check_ops_to_skip() (data_juicer.utils.ckpt_utils.CheckpointManager method) check_packages() (data_juicer.utils.lazy_loader.LazyLoader class method) CheckpointManager (class in data_juicer.utils.ckpt_utils) ChineseConvertMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.chinese_convert_mapper) clean_nltk_cache() (in module data_juicer.utils.nltk_utils) CleanCopyrightMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.clean_copyright_mapper) CleanEmailMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.clean_email_mapper) CleanHtmlMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.clean_html_mapper) CleanIpMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.clean_ip_mapper) CleanLinksMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.clean_links_mapper) cleanup_cache_files() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) (data_juicer.utils.compress.CacheCompressManager method) cleanup_compressed_cache_files() (in module data_juicer.utils.compress) clear() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) close_video() (in module data_juicer.utils.mm_utils) CodeDataValidator (class in data_juicer.core.data.data_validator) collect() (data_juicer.analysis.collector.TextTokenDistCollector method) column_types (data_juicer.core.data.schema.Schema attribute), [1] columns (data_juicer.core.data.schema.Schema attribute), [1] ColumnWiseAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.column_wise_analysis) communication() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) compare_text_index() (in module data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper) compress() (data_juicer.utils.compress.BaseCompressor static method) (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (data_juicer.utils.compress.Compressor class method) (data_juicer.utils.compress.GzipCompressor static method) (data_juicer.utils.compress.Lz4Compressor static method) (data_juicer.utils.compress.ZstdCompressor static method) (in module data_juicer.utils.compress) CompressionOff (class in data_juicer.utils.compress) CompressManager (class in data_juicer.utils.compress) Compressor (class in data_juicer.utils.compress) compressors (data_juicer.utils.compress.Compressor attribute) compute() (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) compute_flow() (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method) (data_juicer.ops.filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.VideoMotionScoreRaftFilter method) compute_hash() (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.Deduplicator method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) compute_nmf_snr() (in module data_juicer.ops.filter.audio_nmf_snr_filter) compute_stats_batched() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.Filter method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.filter.AlphanumericFilter method) (data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method) (data_juicer.ops.filter.AverageLineLengthFilter method) (data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method) (data_juicer.ops.filter.FlaggedWordFilter method) (data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.perplexity_filter.PerplexityFilter method) (data_juicer.ops.filter.PerplexityFilter method) (data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method) (data_juicer.ops.filter.SpecialCharactersFilter method) (data_juicer.ops.filter.text_length_filter.TextLengthFilter method) (data_juicer.ops.filter.TextLengthFilter method) (data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method) (data_juicer.ops.filter.WordRepetitionFilter method) (data_juicer.ops.filter.words_num_filter.WordsNumFilter method) (data_juicer.ops.filter.WordsNumFilter method) (data_juicer.ops.op_fusion.FusedFilter method) compute_stats_single() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.Filter method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) (data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method) (data_juicer.ops.filter.AudioDurationFilter method) (data_juicer.ops.filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.AudioSizeFilter method) (data_juicer.ops.filter.general_field_filter.GeneralFieldFilter method) (data_juicer.ops.filter.GeneralFieldFilter method) (data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method) (data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method) (data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method) (data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method) (data_juicer.ops.filter.image_size_filter.ImageSizeFilter method) (data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method) (data_juicer.ops.filter.ImageAestheticsFilter method) (data_juicer.ops.filter.ImageFaceCountFilter method) (data_juicer.ops.filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.ImageNSFWFilter method) (data_juicer.ops.filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.ImageShapeFilter method) (data_juicer.ops.filter.ImageSizeFilter method) (data_juicer.ops.filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.ImageWatermarkFilter method) (data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.stopwords_filter.StopWordsFilter method) (data_juicer.ops.filter.StopWordsFilter method) (data_juicer.ops.filter.suffix_filter.SuffixFilter method) (data_juicer.ops.filter.SuffixFilter method) (data_juicer.ops.filter.text_action_filter.TextActionFilter method) (data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.text_pair_similarity_filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.TextActionFilter method) (data_juicer.ops.filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.token_num_filter.TokenNumFilter method) (data_juicer.ops.filter.TokenNumFilter method) (data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method) (data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method) (data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method) (data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method) (data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method) (data_juicer.ops.filter.VideoAestheticsFilter method) (data_juicer.ops.filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.VideoDurationFilter method) (data_juicer.ops.filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.VideoNSFWFilter method) (data_juicer.ops.filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.VideoResolutionFilter method) (data_juicer.ops.filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.VideoWatermarkFilter method) config_backup() (in module data_juicer.config.config) CONFIG_VALIDATION_RULES (data_juicer.core.data.config_validator.ConfigValidator attribute) (data_juicer.core.data.load_strategy.DefaultArxivDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.DefaultCommonCrawlDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.DefaultHuggingfaceDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.DefaultLocalDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.DefaultWikiDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.RayHuggingfaceDataLoadStrategy attribute) (data_juicer.core.data.load_strategy.RayLocalJsonDataLoadStrategy attribute) ConfigValidationError ConfigValidator (class in data_juicer.core.data.config_validator) context (data_juicer.utils.constant.Fields attribute) convert_arrow_to_python() (in module data_juicer.ops.base_op) convert_dict_list_to_list_dict() (in module data_juicer.ops.base_op) convert_list_dict_to_dict_list() (in module data_juicer.ops.base_op) convert_to_absolute_paths() (in module data_juicer.core.data.ray_dataset) copy_data() (in module data_juicer.utils.file_utils) create_controller() (in module data_juicer.ops.common.prompt2prompt_pipeline) create_directory_if_not_exists() (in module data_juicer.utils.file_utils) create_executor() (data_juicer.core.executor.ExecutorFactory static method) (data_juicer.core.executor.factory.ExecutorFactory static method) (data_juicer.core.ExecutorFactory static method) create_physical_resource_alias() (in module data_juicer.utils.nltk_utils) create_replacer() (in module data_juicer.ops.mapper.video_split_by_duration_mapper) (in module data_juicer.ops.mapper.video_split_by_key_frame_mapper) CrossEntropyMeasure (class in data_juicer.analysis.measure) CsvFormatter (class in data_juicer.format) (class in data_juicer.format.csv_formatter) cuda_device_count() (in module data_juicer) cut_video_by_seconds() (in module data_juicer.utils.mm_utils) D data_juicer module data_juicer.analysis module data_juicer.analysis.collector module data_juicer.analysis.column_wise_analysis module data_juicer.analysis.diversity_analysis module data_juicer.analysis.draw module data_juicer.analysis.measure module data_juicer.analysis.overall_analysis module data_juicer.config module data_juicer.config.config module data_juicer.core module data_juicer.core.adapter module data_juicer.core.analyzer module data_juicer.core.data module data_juicer.core.data.config_validator module data_juicer.core.data.data_validator module data_juicer.core.data.dataset_builder module data_juicer.core.data.dj_dataset module data_juicer.core.data.load_strategy module data_juicer.core.data.ray_dataset module data_juicer.core.data.schema module data_juicer.core.executor module data_juicer.core.executor.base module data_juicer.core.executor.default_executor module data_juicer.core.executor.factory module data_juicer.core.executor.ray_executor module data_juicer.core.exporter module data_juicer.core.monitor module data_juicer.core.tracer module data_juicer.download module data_juicer.download.commoncrawl module data_juicer.download.downloader module data_juicer.download.wikipedia module data_juicer.format module data_juicer.format.csv_formatter module data_juicer.format.empty_formatter module data_juicer.format.formatter module data_juicer.format.json_formatter module data_juicer.format.load module data_juicer.format.parquet_formatter module data_juicer.format.text_formatter module data_juicer.format.tsv_formatter module data_juicer.ops module data_juicer.ops.aggregator module data_juicer.ops.aggregator.entity_attribute_aggregator module data_juicer.ops.aggregator.meta_tags_aggregator module data_juicer.ops.aggregator.most_relevant_entities_aggregator module data_juicer.ops.aggregator.nested_aggregator module data_juicer.ops.base_op module data_juicer.ops.common module data_juicer.ops.common.helper_func module data_juicer.ops.common.prompt2prompt_pipeline module data_juicer.ops.common.special_characters module data_juicer.ops.deduplicator module data_juicer.ops.deduplicator.document_deduplicator module data_juicer.ops.deduplicator.document_minhash_deduplicator module data_juicer.ops.deduplicator.document_simhash_deduplicator module data_juicer.ops.deduplicator.image_deduplicator module data_juicer.ops.deduplicator.ray_basic_deduplicator module data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator module data_juicer.ops.deduplicator.ray_document_deduplicator module data_juicer.ops.deduplicator.ray_image_deduplicator module data_juicer.ops.deduplicator.ray_video_deduplicator module data_juicer.ops.deduplicator.video_deduplicator module data_juicer.ops.filter module data_juicer.ops.filter.alphanumeric_filter module data_juicer.ops.filter.audio_duration_filter module data_juicer.ops.filter.audio_nmf_snr_filter module data_juicer.ops.filter.audio_size_filter module data_juicer.ops.filter.average_line_length_filter module data_juicer.ops.filter.character_repetition_filter module data_juicer.ops.filter.flagged_words_filter module data_juicer.ops.filter.general_field_filter module data_juicer.ops.filter.image_aesthetics_filter module data_juicer.ops.filter.image_aspect_ratio_filter module data_juicer.ops.filter.image_face_count_filter module data_juicer.ops.filter.image_face_ratio_filter module data_juicer.ops.filter.image_nsfw_filter module data_juicer.ops.filter.image_pair_similarity_filter module data_juicer.ops.filter.image_shape_filter module data_juicer.ops.filter.image_size_filter module data_juicer.ops.filter.image_text_matching_filter module data_juicer.ops.filter.image_text_similarity_filter module data_juicer.ops.filter.image_watermark_filter module data_juicer.ops.filter.language_id_score_filter module data_juicer.ops.filter.llm_difficulty_score_filter module data_juicer.ops.filter.llm_quality_score_filter module data_juicer.ops.filter.maximum_line_length_filter module data_juicer.ops.filter.perplexity_filter module data_juicer.ops.filter.phrase_grounding_recall_filter module data_juicer.ops.filter.special_characters_filter module data_juicer.ops.filter.specified_field_filter module data_juicer.ops.filter.specified_numeric_field_filter module data_juicer.ops.filter.stopwords_filter module data_juicer.ops.filter.suffix_filter module data_juicer.ops.filter.text_action_filter module data_juicer.ops.filter.text_entity_dependency_filter module data_juicer.ops.filter.text_length_filter module data_juicer.ops.filter.text_pair_similarity_filter module data_juicer.ops.filter.token_num_filter module data_juicer.ops.filter.video_aesthetics_filter module data_juicer.ops.filter.video_aspect_ratio_filter module data_juicer.ops.filter.video_duration_filter module data_juicer.ops.filter.video_frames_text_similarity_filter module data_juicer.ops.filter.video_motion_score_filter module data_juicer.ops.filter.video_motion_score_raft_filter module data_juicer.ops.filter.video_nsfw_filter module data_juicer.ops.filter.video_ocr_area_ratio_filter module data_juicer.ops.filter.video_resolution_filter module data_juicer.ops.filter.video_tagging_from_frames_filter module data_juicer.ops.filter.video_watermark_filter module data_juicer.ops.filter.word_repetition_filter module data_juicer.ops.filter.words_num_filter module data_juicer.ops.grouper module data_juicer.ops.grouper.key_value_grouper module data_juicer.ops.grouper.naive_grouper module data_juicer.ops.grouper.naive_reverse_grouper module data_juicer.ops.load module data_juicer.ops.mapper module data_juicer.ops.mapper.annotation module data_juicer.ops.mapper.annotation.annotation_mapper module data_juicer.ops.mapper.annotation.human_preference_annotation_mapper module data_juicer.ops.mapper.audio_add_gaussian_noise_mapper module data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper module data_juicer.ops.mapper.calibrate_qa_mapper module data_juicer.ops.mapper.calibrate_query_mapper module data_juicer.ops.mapper.calibrate_response_mapper module data_juicer.ops.mapper.chinese_convert_mapper module data_juicer.ops.mapper.clean_copyright_mapper module data_juicer.ops.mapper.clean_email_mapper module data_juicer.ops.mapper.clean_html_mapper module data_juicer.ops.mapper.clean_ip_mapper module data_juicer.ops.mapper.clean_links_mapper module data_juicer.ops.mapper.dialog_intent_detection_mapper module data_juicer.ops.mapper.dialog_sentiment_detection_mapper module data_juicer.ops.mapper.dialog_sentiment_intensity_mapper module data_juicer.ops.mapper.dialog_topic_detection_mapper module data_juicer.ops.mapper.expand_macro_mapper module data_juicer.ops.mapper.extract_entity_attribute_mapper module data_juicer.ops.mapper.extract_entity_relation_mapper module data_juicer.ops.mapper.extract_event_mapper module data_juicer.ops.mapper.extract_keyword_mapper module data_juicer.ops.mapper.extract_nickname_mapper module data_juicer.ops.mapper.extract_support_text_mapper module data_juicer.ops.mapper.extract_tables_from_html_mapper module data_juicer.ops.mapper.fix_unicode_mapper module data_juicer.ops.mapper.generate_qa_from_examples_mapper module data_juicer.ops.mapper.generate_qa_from_text_mapper module data_juicer.ops.mapper.image_blur_mapper module data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper module data_juicer.ops.mapper.image_captioning_mapper module data_juicer.ops.mapper.image_diffusion_mapper module data_juicer.ops.mapper.image_face_blur_mapper module data_juicer.ops.mapper.image_remove_background_mapper module data_juicer.ops.mapper.image_segment_mapper module data_juicer.ops.mapper.image_tagging_mapper module data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper module data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper module data_juicer.ops.mapper.mllm_mapper module data_juicer.ops.mapper.nlpaug_en_mapper module data_juicer.ops.mapper.nlpcda_zh_mapper module data_juicer.ops.mapper.optimize_qa_mapper module data_juicer.ops.mapper.optimize_query_mapper module data_juicer.ops.mapper.optimize_response_mapper module data_juicer.ops.mapper.pair_preference_mapper module data_juicer.ops.mapper.punctuation_normalization_mapper module data_juicer.ops.mapper.python_file_mapper module data_juicer.ops.mapper.python_lambda_mapper module data_juicer.ops.mapper.query_intent_detection_mapper module data_juicer.ops.mapper.query_sentiment_detection_mapper module data_juicer.ops.mapper.query_topic_detection_mapper module data_juicer.ops.mapper.relation_identity_mapper module data_juicer.ops.mapper.remove_bibliography_mapper module data_juicer.ops.mapper.remove_comments_mapper module data_juicer.ops.mapper.remove_header_mapper module data_juicer.ops.mapper.remove_long_words_mapper module data_juicer.ops.mapper.remove_non_chinese_character_mapper module data_juicer.ops.mapper.remove_repeat_sentences_mapper module data_juicer.ops.mapper.remove_specific_chars_mapper module data_juicer.ops.mapper.remove_table_text_mapper module data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper module data_juicer.ops.mapper.replace_content_mapper module data_juicer.ops.mapper.sdxl_prompt2prompt_mapper module data_juicer.ops.mapper.sentence_augmentation_mapper module data_juicer.ops.mapper.sentence_split_mapper module data_juicer.ops.mapper.text_chunk_mapper module data_juicer.ops.mapper.video_captioning_from_audio_mapper module data_juicer.ops.mapper.video_captioning_from_frames_mapper module data_juicer.ops.mapper.video_captioning_from_summarizer_mapper module data_juicer.ops.mapper.video_captioning_from_video_mapper module data_juicer.ops.mapper.video_extract_frames_mapper module data_juicer.ops.mapper.video_face_blur_mapper module data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper module data_juicer.ops.mapper.video_remove_watermark_mapper module data_juicer.ops.mapper.video_resize_aspect_ratio_mapper module data_juicer.ops.mapper.video_resize_resolution_mapper module data_juicer.ops.mapper.video_split_by_duration_mapper module data_juicer.ops.mapper.video_split_by_key_frame_mapper module data_juicer.ops.mapper.video_split_by_scene_mapper module data_juicer.ops.mapper.video_tagging_from_audio_mapper module data_juicer.ops.mapper.video_tagging_from_frames_mapper module data_juicer.ops.mapper.whitespace_normalization_mapper module data_juicer.ops.mixins module data_juicer.ops.op_fusion module data_juicer.ops.selector module data_juicer.ops.selector.frequency_specified_field_selector module data_juicer.ops.selector.random_selector module data_juicer.ops.selector.range_specified_field_selector module data_juicer.ops.selector.tags_specified_field_selector module data_juicer.ops.selector.topk_specified_field_selector module data_juicer.tools module data_juicer.utils module data_juicer.utils.asset_utils module data_juicer.utils.availability_utils module data_juicer.utils.cache_utils module data_juicer.utils.ckpt_utils module data_juicer.utils.common_utils module data_juicer.utils.compress module data_juicer.utils.constant module data_juicer.utils.file_utils module data_juicer.utils.fingerprint_utils module data_juicer.utils.lazy_loader module data_juicer.utils.logger_utils module data_juicer.utils.mm_utils module data_juicer.utils.model_utils module data_juicer.utils.nltk_utils module data_juicer.utils.process_utils module data_juicer.utils.registry module data_juicer.utils.resource_utils module data_juicer.utils.sample module data_juicer.utils.unittest_utils module data_source (data_juicer.core.data.load_strategy.StrategyKey attribute) data_type (data_juicer.core.data.load_strategy.StrategyKey attribute) DataJuicerFormatValidator (class in data_juicer.core.data.data_validator) DataJuicerTestCaseBase (class in data_juicer.utils.unittest_utils) DataLoadStrategy (class in data_juicer.core.data.load_strategy) DataLoadStrategyRegistry (class in data_juicer.core.data.load_strategy) dataset_cache_control() (in module data_juicer.utils.cache_utils) DatasetBuilder (class in data_juicer.core.data.dataset_builder) DatasetCacheControl (class in data_juicer.utils.cache_utils) DataValidationError DataValidator (class in data_juicer.core.data.data_validator) DataValidatorRegistry (class in data_juicer.core.data.data_validator) decompress() (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (in module data_juicer.utils.compress) Deduplicator (class in data_juicer.ops) (class in data_juicer.ops.base_op) DedupSet (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) DEFAULT_ANALYSIS_PATTERN (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_ANALYSIS_TEMPLATE (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_ATTR_PATTERN_TEMPLATE (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute) DEFAULT_CANDIDATES_TEMPLATE (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_COMPLETION_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractKeywordMapper attribute) DEFAULT_CONTINUE_PROMPT (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_DEMON_PATTERN (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute) DEFAULT_ENTITY_PATTERN (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_ENTITY_TYPES (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_EXAMPLE_PROMPT (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) DEFAULT_FIELD_TEMPLATE (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter attribute) (data_juicer.ops.filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.LLMQualityScoreFilter attribute) DEFAULT_IF_LOOP_PROMPT (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_INPUT_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter attribute) (data_juicer.ops.filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.LLMQualityScoreFilter attribute) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper attribute) (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.ExtractSupportTextMapper attribute) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute) (data_juicer.ops.mapper.PairPreferenceMapper attribute) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute) (data_juicer.ops.mapper.RelationIdentityMapper attribute) DEFAULT_INTENSITY_PATTERN (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) DEFAULT_INTENSITY_TEMPLATE (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) DEFAULT_LABEL_CONFIG (data_juicer.ops.mapper.annotation.human_preference_annotation_mapper.HumanPreferenceAnnotationMapper attribute) (data_juicer.ops.mapper.HumanPreferenceAnnotationMapper attribute) DEFAULT_LABELS_PATTERN (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_LABELS_TEMPLATE (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_OUTPUT_PATTERN (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.ExtractKeywordMapper attribute) (data_juicer.ops.mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute) (data_juicer.ops.mapper.PairPreferenceMapper attribute) DEFAULT_OUTPUT_PATTERN_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute) (data_juicer.ops.mapper.RelationIdentityMapper attribute) DEFAULT_PROMPT_TEMPLATE (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractKeywordMapper attribute) DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.OptimizeQAMapper attribute) DEFAULT_QUERY_TEMPLATE (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_RECORD_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_REFERENCE_TEMPLATE (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.CalibrateQAMapper attribute) DEFAULT_RELATION_PATTERN (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DEFAULT_RESPONSE_TEMPLATE (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) DEFAULT_SUB_DOC_TEMPLATE (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) DEFAULT_SYSTEM_PROMPT (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter attribute) (data_juicer.ops.filter.LLMDifficultyScoreFilter attribute) (data_juicer.ops.filter.LLMQualityScoreFilter attribute) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.calibrate_query_mapper.CalibrateQueryMapper attribute) (data_juicer.ops.mapper.calibrate_response_mapper.CalibrateResponseMapper attribute) (data_juicer.ops.mapper.CalibrateQAMapper attribute) (data_juicer.ops.mapper.CalibrateQueryMapper attribute) (data_juicer.ops.mapper.CalibrateResponseMapper attribute) (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute) (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute) (data_juicer.ops.mapper.DialogTopicDetectionMapper attribute) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper attribute) (data_juicer.ops.mapper.ExtractEventMapper attribute) (data_juicer.ops.mapper.ExtractNicknameMapper attribute) (data_juicer.ops.mapper.ExtractSupportTextMapper attribute) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.optimize_query_mapper.OptimizeQueryMapper attribute) (data_juicer.ops.mapper.optimize_response_mapper.OptimizeResponseMapper attribute) (data_juicer.ops.mapper.OptimizeQAMapper attribute) (data_juicer.ops.mapper.OptimizeQueryMapper attribute) (data_juicer.ops.mapper.OptimizeResponseMapper attribute) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper attribute) (data_juicer.ops.mapper.PairPreferenceMapper attribute) DEFAULT_SYSTEM_PROMPT_TEMPLATE (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper attribute) (data_juicer.ops.mapper.RelationIdentityMapper attribute) DEFAULT_SYSTEM_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) DEFAULT_TAG_TEMPLATE (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) DEFAULT_TARGET_TAG_TEMPLATE (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) DEFAULT_TUPLE_DELIMITER (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper attribute) (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute) DefaultArxivDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultCommonCrawlDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultExecutor (class in data_juicer.core) (class in data_juicer.core.executor) (class in data_juicer.core.executor.default_executor) DefaultHuggingfaceDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultLocalDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultModelScopeDataLoadStrategy (class in data_juicer.core.data.load_strategy) DefaultWikiDataLoadStrategy (class in data_juicer.core.data.load_strategy) detect_faces() (in module data_juicer.utils.mm_utils) dialog_intent_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_intent_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_intensity (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_intensity_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_topic_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_topic_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) DialogIntentDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.dialog_intent_detection_mapper) DialogSentimentDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.dialog_sentiment_detection_mapper) DialogSentimentIntensityMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.dialog_sentiment_intensity_mapper) DialogTopicDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.dialog_topic_detection_mapper) dict_to_hash() (in module data_juicer.utils.common_utils) Difference_Area_Generator_Mapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper) Difference_Caption_Generator_Mapper (class in data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper) dispatch (data_juicer.utils.fingerprint_utils.Hasher attribute) display_config() (in module data_juicer.config.config) distribute_edge() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) DiversityAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.diversity_analysis) dj_configs (data_juicer.utils.constant.JobRequiredKeys attribute) DJDataset (class in data_juicer.core.data) (class in data_juicer.core.data.dj_dataset) DocumentDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_deduplicator) DocumentDownloader (class in data_juicer.download.downloader) DocumentExtractor (class in data_juicer.download.downloader) DocumentIterator (class in data_juicer.download.downloader) DocumentMinhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_minhash_deduplicator) DocumentSimhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_simhash_deduplicator) download() (data_juicer.download.downloader.DocumentDownloader method) (data_juicer.download.wikipedia.WikipediaDownloader method) download_and_extract() (in module data_juicer.download.downloader) download_wikipedia() (in module data_juicer.download.wikipedia) draw_box() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) draw_heatmap() (in module data_juicer.analysis.draw) draw_hist() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) draw_resource_util_graph() (data_juicer.core.Monitor static method) (data_juicer.core.monitor.Monitor static method) draw_wordcloud() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) dup_idx() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) DYNAMIC_FIELDS (data_juicer.core.Monitor attribute) (data_juicer.core.monitor.Monitor attribute) E edge_redistribution() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) EdgeBuffer (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) EMPTY_HASH_VALUE (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator attribute) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator attribute) (data_juicer.ops.deduplicator.RayBasicDeduplicator attribute) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator attribute) empty_history() (data_juicer.ops.base_op.OP method) EmptyControl (class in data_juicer.ops.common.prompt2prompt_pipeline) EmptyFormatter (class in data_juicer.format) (class in data_juicer.format.empty_formatter) ensure_nltk_resource() (in module data_juicer.utils.nltk_utils) entity (data_juicer.utils.constant.MetaKeys attribute) entity_attribute (data_juicer.utils.constant.BatchMetaKeys attribute) entity_description (data_juicer.utils.constant.MetaKeys attribute) entity_name (data_juicer.utils.constant.MetaKeys attribute) entity_type (data_juicer.utils.constant.MetaKeys attribute) EntityAttributeAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.entity_attribute_aggregator) EntropyMeasure (class in data_juicer.analysis.measure) eoc (data_juicer.utils.mm_utils.SpecialTokens attribute) event_description (data_juicer.utils.constant.MetaKeys attribute) EventDrivenMixin (class in data_juicer.ops.mixins) execute_and_probe() (data_juicer.core.Adapter static method) (data_juicer.core.adapter.Adapter static method) executor_type (data_juicer.core.data.load_strategy.StrategyKey attribute) ExecutorBase (class in data_juicer.core) (class in data_juicer.core.executor) (class in data_juicer.core.executor.base) ExecutorFactory (class in data_juicer.core) (class in data_juicer.core.executor) (class in data_juicer.core.executor.factory) expand_outdir_and_mkdir() (in module data_juicer.utils.file_utils) ExpandMacroMapper (class in data_juicer.ops.mapper.expand_macro_mapper) export() (data_juicer.core.Exporter method) (data_juicer.core.exporter.Exporter method) export_compute_stats() (data_juicer.core.Exporter method) (data_juicer.core.exporter.Exporter method) export_config() (in module data_juicer.config) (in module data_juicer.config.config) Exporter (class in data_juicer.core) (class in data_juicer.core.exporter) ExpressionTransformer (class in data_juicer.ops.filter.general_field_filter) extra_configs (data_juicer.utils.constant.JobRequiredKeys attribute) extract() (data_juicer.download.downloader.DocumentExtractor method) (data_juicer.download.wikipedia.WikipediaExtractor method) (data_juicer.utils.compress.Extractor class method) extract_audio_from_video() (in module data_juicer.utils.mm_utils) extract_key_frames() (in module data_juicer.utils.mm_utils) extract_key_frames_by_seconds() (in module data_juicer.utils.mm_utils) extract_txt_from_docx() (in module data_juicer.format.text_formatter) extract_txt_from_pdf() (in module data_juicer.format.text_formatter) extract_video_frames_uniformly() (in module data_juicer.utils.mm_utils) extract_video_frames_uniformly_by_seconds() (in module data_juicer.utils.mm_utils) ExtractEntityAttributeMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_entity_attribute_mapper) ExtractEntityRelationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_entity_relation_mapper) ExtractEventMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_event_mapper) ExtractKeywordMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_keyword_mapper) ExtractNicknameMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_nickname_mapper) Extractor (class in data_juicer.utils.compress) ExtractSupportTextMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_support_text_mapper) ExtractTablesFromHtmlMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.extract_tables_from_html_mapper) F face_counts (data_juicer.utils.constant.StatsKeysConstant attribute) face_detections (data_juicer.utils.constant.StatsKeysConstant attribute) face_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) Fields (class in data_juicer.utils.constant) FileLock (class in data_juicer.utils.compress) fileno() (data_juicer.utils.logger_utils.StreamToLoguru method) Filter (class in data_juicer.ops) (class in data_juicer.ops.base_op) filter() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) filter_batch() (in module data_juicer.core.data.ray_dataset) filter_with_union_find() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) find() (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) find_corresponding_test_file() (in module data_juicer.utils.unittest_utils) find_files_with_suffix() (in module data_juicer.utils.file_utils) find_noun_phrases() (in module data_juicer.ops.filter.phrase_grounding_recall_filter) find_root_verb_and_its_dobj() (in module data_juicer.analysis.diversity_analysis) find_root_verb_and_its_dobj_in_string() (in module data_juicer.analysis.diversity_analysis) FixUnicodeMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.fix_unicode_mapper) flagged_words_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) FlaggedWordFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.flagged_words_filter) flush() (data_juicer.utils.logger_utils.StreamToLoguru method) flush_key_value_pairs() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) follow_read() (in module data_juicer.utils.file_utils) format_cache_file_name() (data_juicer.utils.compress.CacheCompressManager method) forward() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) (data_juicer.ops.common.prompt2prompt_pipeline.EmptyControl method) free_models() (in module data_juicer.utils.model_utils) FrequencySpecifiedFieldSelector (class in data_juicer.ops.selector) (class in data_juicer.ops.selector.frequency_specified_field_selector) from_dict() (data_juicer.core.data.dj_dataset.NestedDataset class method) (data_juicer.core.data.NestedDataset class method) (data_juicer.core.NestedDataset class method) fuse_filter_group() (in module data_juicer.ops.op_fusion) fuse_operators() (in module data_juicer.ops.op_fusion) FusedFilter (class in data_juicer.ops.op_fusion) G general_field_filter_condition (data_juicer.utils.constant.StatsKeysConstant attribute) GeneralFieldFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.general_field_filter) GeneralFusedOP (class in data_juicer.ops.op_fusion) generate_dataset() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method) generate_fingerprint() (in module data_juicer.utils.fingerprint_utils) GenerateQAFromExamplesMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.generate_qa_from_examples_mapper) GenerateQAFromTextMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.generate_qa_from_text_mapper) generic_visit() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) get() (data_juicer.core.data.dj_dataset.DJDataset method) (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.DJDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.data.ray_dataset.RayDataset method) (data_juicer.core.NestedDataset method) (data_juicer.utils.registry.Registry method) get_abs_path() (in module data_juicer.core.data.ray_dataset) get_access_log() (data_juicer.utils.constant.StatsKeysMeta method) get_aligned_sequences() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_all_annotations() (data_juicer.ops.mapper.annotation.annotation_mapper.LabelStudioAnnotationMapper method) get_all_dependencies() (data_juicer.utils.lazy_loader.LazyLoader class method) get_all_files_paths_under() (in module data_juicer.utils.file_utils) get_arxiv_urls() (in module data_juicer.download.downloader) get_average_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) get_backup_model_link() (in module data_juicer.utils.model_utils) get_caller_name() (in module data_juicer.utils.logger_utils) get_column() (data_juicer.core.data.dj_dataset.DJDataset method) (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.DJDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.data.ray_dataset.RayDataset method) (data_juicer.core.NestedDataset method) get_cpu_count() (in module data_juicer.utils.resource_utils) get_cpu_utilization() (in module data_juicer.utils.resource_utils) get_decoded_frames_from_video() (in module data_juicer.utils.mm_utils) get_default_cfg() (in module data_juicer.config) (in module data_juicer.config.config) get_diff_files() (in module data_juicer.utils.unittest_utils) get_diversity() (in module data_juicer.analysis.diversity_analysis) get_edges() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) get_empty_store() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore static method) get_equalizer() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_file_size() (in module data_juicer.utils.mm_utils) get_hash_method() (in module data_juicer.ops.deduplicator.image_deduplicator) (in module data_juicer.ops.deduplicator.ray_image_deduplicator) get_init_configs() (in module data_juicer.config) (in module data_juicer.config.config) get_key_frame_seconds() (in module data_juicer.utils.mm_utils) get_left_process_list() (data_juicer.utils.ckpt_utils.CheckpointManager method) get_log_file_path() (in module data_juicer.utils.logger_utils) get_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_matrix() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_min_cuda_memory() (in module data_juicer.utils.process_utils) get_model() (in module data_juicer.utils.model_utils) get_next_id() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.IdGenerator method) get_num_gpus() (in module data_juicer.core.data.ray_dataset) get_package_name() (data_juicer.utils.lazy_loader.LazyLoader class method) get_partial_test_cases() (in module data_juicer.utils.unittest_utils) get_reader() (data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.VideoOcrAreaRatioFilter method) get_refinement_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_remote_classes() (in module data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) get_remote_dedup_set() (in module data_juicer.ops.deduplicator.ray_basic_deduplicator) get_replacement_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_replacement_mapper_() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_row_col() (in module data_juicer.analysis.column_wise_analysis) get_sample_numbers() (in module data_juicer.core.data.dataset_builder) get_sentences_from_document() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) get_special_tokens() (in module data_juicer.utils.mm_utils) get_split_key_frame() (data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method) (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method) get_strategy_class() (data_juicer.core.data.load_strategy.DataLoadStrategyRegistry class method) get_text_chunks() (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method) (data_juicer.ops.mapper.TextChunkMapper method) get_time_words_attention_alpha() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_toml_file_path() (in module data_juicer.utils.lazy_loader) get_traceback_matrix() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_uv_lock_path() (in module data_juicer.utils.lazy_loader) get_validator() (data_juicer.core.data.data_validator.DataValidatorRegistry class method) get_video_duration() (in module data_juicer.utils.mm_utils) get_wikipedia_urls() (in module data_juicer.download.downloader) get_word_inds() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_words_from_document() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) getvalue() (data_juicer.utils.logger_utils.StreamToLoguru method) GiB (data_juicer.core.Exporter attribute) (data_juicer.core.exporter.Exporter attribute) global_align() (in module data_juicer.ops.common.prompt2prompt_pipeline) Grouper (class in data_juicer.ops) (class in data_juicer.ops.base_op) GzipCompressor (class in data_juicer.utils.compress) H hash (data_juicer.utils.constant.HashKeys attribute) hash() (data_juicer.utils.fingerprint_utils.Hasher class method) hash_bytes() (data_juicer.utils.fingerprint_utils.Hasher class method) hash_default() (data_juicer.utils.fingerprint_utils.Hasher class method) Hasher (class in data_juicer.utils.fingerprint_utils) HashKeys (class in data_juicer.utils.constant) hexdigest() (data_juicer.utils.fingerprint_utils.Hasher method) HiddenPrints (class in data_juicer.utils.logger_utils) hook (data_juicer.utils.constant.JobRequiredKeys attribute) html_tables (data_juicer.utils.constant.MetaKeys attribute) HumanPreferenceAnnotationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.annotation.human_preference_annotation_mapper) I IdGenerator (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) image (data_juicer.utils.mm_utils.SpecialTokens attribute) image_aesthetics_scores (data_juicer.utils.constant.StatsKeysConstant attribute) image_byte_to_base64() (in module data_juicer.utils.mm_utils) image_height (data_juicer.utils.constant.StatsKeysConstant attribute) image_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute) image_pair_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) image_path_to_base64() (in module data_juicer.utils.mm_utils) image_sizes (data_juicer.utils.constant.StatsKeysConstant attribute) image_tags (data_juicer.utils.constant.MetaKeys attribute) image_text_matching_score (data_juicer.utils.constant.StatsKeysConstant attribute) image_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) image_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute) image_width (data_juicer.utils.constant.StatsKeysConstant attribute) ImageAestheticsFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_aesthetics_filter) ImageAspectRatioFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_aspect_ratio_filter) ImageBlurMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_blur_mapper) ImageCaptioningFromGPT4VMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper) ImageCaptioningMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_captioning_mapper) ImageDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.image_deduplicator) ImageDiffusionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_diffusion_mapper) ImageFaceBlurMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_face_blur_mapper) ImageFaceCountFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_face_count_filter) ImageFaceRatioFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_face_ratio_filter) imagehash (data_juicer.utils.constant.HashKeys attribute) ImageNSFWFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_nsfw_filter) ImagePairSimilarityFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_pair_similarity_filter) ImageRemoveBackgroundMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_remove_background_mapper) ImageSegmentMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_segment_mapper) ImageShapeFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_shape_filter) ImageSizeFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_size_filter) ImageTaggingMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.image_tagging_mapper) ImageTextMatchingFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_text_matching_filter) ImageTextSimilarityFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_text_similarity_filter) ImageWatermarkFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.image_watermark_filter) init_configs() (in module data_juicer.config) (in module data_juicer.config.config) init_setup_from_cfg() (in module data_juicer.config.config) insert_texts_after_placeholders() (in module data_juicer.utils.mm_utils) insight_mining() (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) InterVars (class in data_juicer.utils.constant) iou() (in module data_juicer.utils.mm_utils) iou_filter() (in module data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper) is_absolute_path() (in module data_juicer.utils.file_utils) is_batched_op() (data_juicer.ops.base_op.OP method) is_cuda_available() (in module data_juicer) is_float() (in module data_juicer.utils.common_utils) is_noun() (in module data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper) is_number() (in module data_juicer.ops.filter.specified_numeric_field_filter) is_string_list() (in module data_juicer.utils.common_utils) is_unique (data_juicer.utils.constant.HashKeys attribute) is_unique() (data_juicer.ops.deduplicator.ray_basic_deduplicator.ActorBackend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.Backend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.DedupSet method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RedisBackend method) isatty() (data_juicer.utils.logger_utils.StreamToLoguru method) iterate() (data_juicer.download.downloader.DocumentIterator method) (data_juicer.download.wikipedia.WikipediaIterator method) J JobRequiredKeys (class in data_juicer.utils.constant) JSDivMeasure (class in data_juicer.analysis.measure) JsonFormatter (class in data_juicer.format) (class in data_juicer.format.json_formatter) JSONStreamDatasource (class in data_juicer.core.data.ray_dataset) K KeyValueGrouper (class in data_juicer.ops.grouper) (class in data_juicer.ops.grouper.key_value_grouper) keyword (data_juicer.utils.constant.MetaKeys attribute) KiB (data_juicer.core.Exporter attribute) (data_juicer.core.exporter.Exporter attribute) KLDivMeasure (class in data_juicer.analysis.measure) L LabelStudioAnnotationMapper (class in data_juicer.ops.mapper.annotation.annotation_mapper) lang (data_juicer.utils.constant.StatsKeysConstant attribute) lang_score (data_juicer.utils.constant.StatsKeysConstant attribute) LanguageIDScoreFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.language_id_score_filter) LazyLoader (class in data_juicer.utils.lazy_loader) light_rag_extraction() (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.ExtractEntityRelationMapper method) lines (data_juicer.utils.constant.InterVars attribute) list() (data_juicer.utils.registry.Registry method) llm_difficulty_record (data_juicer.utils.constant.StatsKeysConstant attribute) llm_difficulty_score (data_juicer.utils.constant.StatsKeysConstant attribute) llm_quality_record (data_juicer.utils.constant.StatsKeysConstant attribute) llm_quality_score (data_juicer.utils.constant.StatsKeysConstant attribute) LLMDifficultyScoreFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.llm_difficulty_score_filter) LLMQualityScoreFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.llm_quality_score_filter) load_audio() (in module data_juicer.utils.mm_utils) load_audios() (in module data_juicer.utils.mm_utils) load_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) load_data() (data_juicer.core.data.load_strategy.DataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultArxivDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultCommonCrawlDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultHuggingfaceDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultLocalDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultModelScopeDataLoadStrategy method) (data_juicer.core.data.load_strategy.DefaultWikiDataLoadStrategy method) (data_juicer.core.data.load_strategy.RayDataLoadStrategy method) (data_juicer.core.data.load_strategy.RayHuggingfaceDataLoadStrategy method) (data_juicer.core.data.load_strategy.RayLocalJsonDataLoadStrategy method) load_data_with_context() (in module data_juicer.utils.mm_utils) load_dataset() (data_juicer.core.data.dataset_builder.DatasetBuilder method) (data_juicer.format.empty_formatter.EmptyFormatter method) (data_juicer.format.empty_formatter.RayEmptyFormatter method) (data_juicer.format.EmptyFormatter method) (data_juicer.format.formatter.BaseFormatter method) (data_juicer.format.formatter.LocalFormatter method) (data_juicer.format.formatter.RemoteFormatter method) (data_juicer.format.LocalFormatter method) (data_juicer.format.RayEmptyFormatter method) (data_juicer.format.RemoteFormatter method) (data_juicer.format.text_formatter.TextFormatter method) (data_juicer.format.TextFormatter method) load_dataset_by_generated_config() (data_juicer.core.data.dataset_builder.DatasetBuilder class method) load_formatter() (in module data_juicer.format.load) load_from_disk() (data_juicer.core.data.dj_dataset.NestedDataset static method) (data_juicer.core.data.NestedDataset static method) (data_juicer.core.NestedDataset static method) load_image() (in module data_juicer.utils.mm_utils) load_image_byte() (in module data_juicer.utils.mm_utils) load_images() (in module data_juicer.utils.mm_utils) load_images_byte() (in module data_juicer.utils.mm_utils) load_ops() (in module data_juicer.ops) (in module data_juicer.ops.load) load_ops_with_stats_meta() (in module data_juicer.config.config) load_video() (in module data_juicer.utils.mm_utils) load_videos() (in module data_juicer.utils.mm_utils) load_words_asset() (in module data_juicer.utils.asset_utils) loaded_audios (data_juicer.utils.constant.InterVars attribute) loaded_images (data_juicer.utils.constant.InterVars attribute) loaded_videos (data_juicer.utils.constant.InterVars attribute) LocalBlend (class in data_juicer.ops.common.prompt2prompt_pipeline) LocalFormatter (class in data_juicer.format) (class in data_juicer.format.formatter) Lz4Compressor (class in data_juicer.utils.compress) M main_entities (data_juicer.utils.constant.MetaKeys attribute) make_log_summarization() (in module data_juicer.utils.logger_utils) map() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.dj_dataset.NestedDatasetDict method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) map_hf_type_to_python() (data_juicer.core.data.schema.Schema class method) map_ray_type_to_python() (data_juicer.core.data.schema.Schema class method) Mapper (class in data_juicer.ops) (class in data_juicer.ops.base_op) matches() (data_juicer.core.data.load_strategy.StrategyKey method) MAX_BATCH_SIZE (data_juicer.core.Adapter attribute) (data_juicer.core.adapter.Adapter attribute) max_line_length (data_juicer.utils.constant.StatsKeysConstant attribute) MaximumLineLengthFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.maximum_line_length_filter) Measure (class in data_juicer.analysis.measure) measure() (data_juicer.analysis.measure.CrossEntropyMeasure method) (data_juicer.analysis.measure.EntropyMeasure method) (data_juicer.analysis.measure.JSDivMeasure method) (data_juicer.analysis.measure.KLDivMeasure method) (data_juicer.analysis.measure.Measure method) (data_juicer.analysis.measure.RelatedTTestMeasure method) merge() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) merge_config() (in module data_juicer.config) (in module data_juicer.config.config) merge_on_whitespace_tab_newline() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) merge_op_batch() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) meta (data_juicer.utils.constant.Fields attribute) meta_map() (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) meta_name (data_juicer.utils.constant.JobRequiredKeys attribute) MetaKeys (class in data_juicer.utils.constant) MetaTagsAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.meta_tags_aggregator) MiB (data_juicer.core.Exporter attribute) (data_juicer.core.exporter.Exporter attribute) minhash (data_juicer.utils.constant.HashKeys attribute) mis_match_char() (data_juicer.ops.common.prompt2prompt_pipeline.ScoreParams method) MllmMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.mllm_mapper) module data_juicer data_juicer.analysis data_juicer.analysis.collector data_juicer.analysis.column_wise_analysis data_juicer.analysis.diversity_analysis data_juicer.analysis.draw data_juicer.analysis.measure data_juicer.analysis.overall_analysis data_juicer.config data_juicer.config.config data_juicer.core data_juicer.core.adapter data_juicer.core.analyzer data_juicer.core.data data_juicer.core.data.config_validator data_juicer.core.data.data_validator data_juicer.core.data.dataset_builder data_juicer.core.data.dj_dataset data_juicer.core.data.load_strategy data_juicer.core.data.ray_dataset data_juicer.core.data.schema data_juicer.core.executor data_juicer.core.executor.base data_juicer.core.executor.default_executor data_juicer.core.executor.factory data_juicer.core.executor.ray_executor data_juicer.core.exporter data_juicer.core.monitor data_juicer.core.tracer data_juicer.download data_juicer.download.commoncrawl data_juicer.download.downloader data_juicer.download.wikipedia data_juicer.format data_juicer.format.csv_formatter data_juicer.format.empty_formatter data_juicer.format.formatter data_juicer.format.json_formatter data_juicer.format.load data_juicer.format.parquet_formatter data_juicer.format.text_formatter data_juicer.format.tsv_formatter data_juicer.ops data_juicer.ops.aggregator data_juicer.ops.aggregator.entity_attribute_aggregator data_juicer.ops.aggregator.meta_tags_aggregator data_juicer.ops.aggregator.most_relevant_entities_aggregator data_juicer.ops.aggregator.nested_aggregator data_juicer.ops.base_op data_juicer.ops.common data_juicer.ops.common.helper_func data_juicer.ops.common.prompt2prompt_pipeline data_juicer.ops.common.special_characters data_juicer.ops.deduplicator data_juicer.ops.deduplicator.document_deduplicator data_juicer.ops.deduplicator.document_minhash_deduplicator data_juicer.ops.deduplicator.document_simhash_deduplicator data_juicer.ops.deduplicator.image_deduplicator data_juicer.ops.deduplicator.ray_basic_deduplicator data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator data_juicer.ops.deduplicator.ray_document_deduplicator data_juicer.ops.deduplicator.ray_image_deduplicator data_juicer.ops.deduplicator.ray_video_deduplicator data_juicer.ops.deduplicator.video_deduplicator data_juicer.ops.filter data_juicer.ops.filter.alphanumeric_filter data_juicer.ops.filter.audio_duration_filter data_juicer.ops.filter.audio_nmf_snr_filter data_juicer.ops.filter.audio_size_filter data_juicer.ops.filter.average_line_length_filter data_juicer.ops.filter.character_repetition_filter data_juicer.ops.filter.flagged_words_filter data_juicer.ops.filter.general_field_filter data_juicer.ops.filter.image_aesthetics_filter data_juicer.ops.filter.image_aspect_ratio_filter data_juicer.ops.filter.image_face_count_filter data_juicer.ops.filter.image_face_ratio_filter data_juicer.ops.filter.image_nsfw_filter data_juicer.ops.filter.image_pair_similarity_filter data_juicer.ops.filter.image_shape_filter data_juicer.ops.filter.image_size_filter data_juicer.ops.filter.image_text_matching_filter data_juicer.ops.filter.image_text_similarity_filter data_juicer.ops.filter.image_watermark_filter data_juicer.ops.filter.language_id_score_filter data_juicer.ops.filter.llm_difficulty_score_filter data_juicer.ops.filter.llm_quality_score_filter data_juicer.ops.filter.maximum_line_length_filter data_juicer.ops.filter.perplexity_filter data_juicer.ops.filter.phrase_grounding_recall_filter data_juicer.ops.filter.special_characters_filter data_juicer.ops.filter.specified_field_filter data_juicer.ops.filter.specified_numeric_field_filter data_juicer.ops.filter.stopwords_filter data_juicer.ops.filter.suffix_filter data_juicer.ops.filter.text_action_filter data_juicer.ops.filter.text_entity_dependency_filter data_juicer.ops.filter.text_length_filter data_juicer.ops.filter.text_pair_similarity_filter data_juicer.ops.filter.token_num_filter data_juicer.ops.filter.video_aesthetics_filter data_juicer.ops.filter.video_aspect_ratio_filter data_juicer.ops.filter.video_duration_filter data_juicer.ops.filter.video_frames_text_similarity_filter data_juicer.ops.filter.video_motion_score_filter data_juicer.ops.filter.video_motion_score_raft_filter data_juicer.ops.filter.video_nsfw_filter data_juicer.ops.filter.video_ocr_area_ratio_filter data_juicer.ops.filter.video_resolution_filter data_juicer.ops.filter.video_tagging_from_frames_filter data_juicer.ops.filter.video_watermark_filter data_juicer.ops.filter.word_repetition_filter data_juicer.ops.filter.words_num_filter data_juicer.ops.grouper data_juicer.ops.grouper.key_value_grouper data_juicer.ops.grouper.naive_grouper data_juicer.ops.grouper.naive_reverse_grouper data_juicer.ops.load data_juicer.ops.mapper data_juicer.ops.mapper.annotation data_juicer.ops.mapper.annotation.annotation_mapper data_juicer.ops.mapper.annotation.human_preference_annotation_mapper data_juicer.ops.mapper.audio_add_gaussian_noise_mapper data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper data_juicer.ops.mapper.calibrate_qa_mapper data_juicer.ops.mapper.calibrate_query_mapper data_juicer.ops.mapper.calibrate_response_mapper data_juicer.ops.mapper.chinese_convert_mapper data_juicer.ops.mapper.clean_copyright_mapper data_juicer.ops.mapper.clean_email_mapper data_juicer.ops.mapper.clean_html_mapper data_juicer.ops.mapper.clean_ip_mapper data_juicer.ops.mapper.clean_links_mapper data_juicer.ops.mapper.dialog_intent_detection_mapper data_juicer.ops.mapper.dialog_sentiment_detection_mapper data_juicer.ops.mapper.dialog_sentiment_intensity_mapper data_juicer.ops.mapper.dialog_topic_detection_mapper data_juicer.ops.mapper.expand_macro_mapper data_juicer.ops.mapper.extract_entity_attribute_mapper data_juicer.ops.mapper.extract_entity_relation_mapper data_juicer.ops.mapper.extract_event_mapper data_juicer.ops.mapper.extract_keyword_mapper data_juicer.ops.mapper.extract_nickname_mapper data_juicer.ops.mapper.extract_support_text_mapper data_juicer.ops.mapper.extract_tables_from_html_mapper data_juicer.ops.mapper.fix_unicode_mapper data_juicer.ops.mapper.generate_qa_from_examples_mapper data_juicer.ops.mapper.generate_qa_from_text_mapper data_juicer.ops.mapper.image_blur_mapper data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper data_juicer.ops.mapper.image_captioning_mapper data_juicer.ops.mapper.image_diffusion_mapper data_juicer.ops.mapper.image_face_blur_mapper data_juicer.ops.mapper.image_remove_background_mapper data_juicer.ops.mapper.image_segment_mapper data_juicer.ops.mapper.image_tagging_mapper data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper data_juicer.ops.mapper.mllm_mapper data_juicer.ops.mapper.nlpaug_en_mapper data_juicer.ops.mapper.nlpcda_zh_mapper data_juicer.ops.mapper.optimize_qa_mapper data_juicer.ops.mapper.optimize_query_mapper data_juicer.ops.mapper.optimize_response_mapper data_juicer.ops.mapper.pair_preference_mapper data_juicer.ops.mapper.punctuation_normalization_mapper data_juicer.ops.mapper.python_file_mapper data_juicer.ops.mapper.python_lambda_mapper data_juicer.ops.mapper.query_intent_detection_mapper data_juicer.ops.mapper.query_sentiment_detection_mapper data_juicer.ops.mapper.query_topic_detection_mapper data_juicer.ops.mapper.relation_identity_mapper data_juicer.ops.mapper.remove_bibliography_mapper data_juicer.ops.mapper.remove_comments_mapper data_juicer.ops.mapper.remove_header_mapper data_juicer.ops.mapper.remove_long_words_mapper data_juicer.ops.mapper.remove_non_chinese_character_mapper data_juicer.ops.mapper.remove_repeat_sentences_mapper data_juicer.ops.mapper.remove_specific_chars_mapper data_juicer.ops.mapper.remove_table_text_mapper data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper data_juicer.ops.mapper.replace_content_mapper data_juicer.ops.mapper.sdxl_prompt2prompt_mapper data_juicer.ops.mapper.sentence_augmentation_mapper data_juicer.ops.mapper.sentence_split_mapper data_juicer.ops.mapper.text_chunk_mapper data_juicer.ops.mapper.video_captioning_from_audio_mapper data_juicer.ops.mapper.video_captioning_from_frames_mapper data_juicer.ops.mapper.video_captioning_from_summarizer_mapper data_juicer.ops.mapper.video_captioning_from_video_mapper data_juicer.ops.mapper.video_extract_frames_mapper data_juicer.ops.mapper.video_face_blur_mapper data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper data_juicer.ops.mapper.video_remove_watermark_mapper data_juicer.ops.mapper.video_resize_aspect_ratio_mapper data_juicer.ops.mapper.video_resize_resolution_mapper data_juicer.ops.mapper.video_split_by_duration_mapper data_juicer.ops.mapper.video_split_by_key_frame_mapper data_juicer.ops.mapper.video_split_by_scene_mapper data_juicer.ops.mapper.video_tagging_from_audio_mapper data_juicer.ops.mapper.video_tagging_from_frames_mapper data_juicer.ops.mapper.whitespace_normalization_mapper data_juicer.ops.mixins data_juicer.ops.op_fusion data_juicer.ops.selector data_juicer.ops.selector.frequency_specified_field_selector data_juicer.ops.selector.random_selector data_juicer.ops.selector.range_specified_field_selector data_juicer.ops.selector.tags_specified_field_selector data_juicer.ops.selector.topk_specified_field_selector data_juicer.tools data_juicer.utils data_juicer.utils.asset_utils data_juicer.utils.availability_utils data_juicer.utils.cache_utils data_juicer.utils.ckpt_utils data_juicer.utils.common_utils data_juicer.utils.compress data_juicer.utils.constant data_juicer.utils.file_utils data_juicer.utils.fingerprint_utils data_juicer.utils.lazy_loader data_juicer.utils.logger_utils data_juicer.utils.mm_utils data_juicer.utils.model_utils data_juicer.utils.nltk_utils data_juicer.utils.process_utils data_juicer.utils.registry data_juicer.utils.resource_utils data_juicer.utils.sample data_juicer.utils.unittest_utils modules (data_juicer.utils.registry.Registry property) Monitor (class in data_juicer.core) (class in data_juicer.core.monitor) monitor_all_resources() (data_juicer.core.Monitor method) (data_juicer.core.monitor.Monitor method) monitor_current_resources() (data_juicer.core.Monitor static method) (data_juicer.core.monitor.Monitor static method) monitor_func() (data_juicer.core.Monitor static method) (data_juicer.core.monitor.Monitor static method) most_relevant_entities (data_juicer.utils.constant.BatchMetaKeys attribute) MostRelevantEntitiesAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.most_relevant_entities_aggregator) multimodal_data_output_dir (data_juicer.utils.constant.Fields attribute) N NaiveGrouper (class in data_juicer.ops.grouper) (class in data_juicer.ops.grouper.naive_grouper) NaiveReverseGrouper (class in data_juicer.ops.grouper) (class in data_juicer.ops.grouper.naive_reverse_grouper) name (data_juicer.analysis.measure.CrossEntropyMeasure attribute) (data_juicer.analysis.measure.EntropyMeasure attribute) (data_juicer.analysis.measure.JSDivMeasure attribute) (data_juicer.analysis.measure.KLDivMeasure attribute) (data_juicer.analysis.measure.Measure attribute) (data_juicer.analysis.measure.RelatedTTestMeasure attribute) (data_juicer.utils.registry.Registry property) namespace_to_arg_list() (in module data_juicer.config.config) nested_access() (in module data_juicer.utils.common_utils) nested_obj_factory() (in module data_juicer.core.data.dj_dataset) nested_query() (in module data_juicer.core.data.dj_dataset) NestedAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.nested_aggregator) NestedDataset (class in data_juicer.core) (class in data_juicer.core.data) (class in data_juicer.core.data.dj_dataset) NestedDatasetDict (class in data_juicer.core.data.dj_dataset) NestedQueryDict (class in data_juicer.core.data.dj_dataset) nickname (data_juicer.utils.constant.MetaKeys attribute) NlpaugEnMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.nlpaug_en_mapper) NlpcdaZhMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.nlpcda_zh_mapper) NotificationMixin (class in data_juicer.ops.mixins) null_value (data_juicer.format.empty_formatter.EmptyFormatter property) (data_juicer.format.empty_formatter.RayEmptyFormatter property) (data_juicer.format.EmptyFormatter property) (data_juicer.format.RayEmptyFormatter property) num_action (data_juicer.utils.constant.StatsKeysConstant attribute) num_dependency_edges (data_juicer.utils.constant.StatsKeysConstant attribute) num_token (data_juicer.utils.constant.StatsKeysConstant attribute) num_uncond_att_layers (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl property) num_words (data_juicer.utils.constant.StatsKeysConstant attribute) O OP (class in data_juicer.ops.base_op) optimal_param() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator) OptimizeQAMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.optimize_qa_mapper) OptimizeQueryMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.optimize_query_mapper) OptimizeResponseMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.optimize_response_mapper) OverallAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.overall_analysis) P P2PCrossAttnProcessor (class in data_juicer.ops.common.prompt2prompt_pipeline) PairPreferenceMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.pair_preference_mapper) ParquetFormatter (class in data_juicer.format) (class in data_juicer.format.parquet_formatter) parse_cli_datapath() (in module data_juicer.core.data.dataset_builder) parse_output() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.LLMQualityScoreFilter method) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.calibrate_query_mapper.CalibrateQueryMapper method) (data_juicer.ops.mapper.calibrate_response_mapper.CalibrateResponseMapper method) (data_juicer.ops.mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.CalibrateQueryMapper method) (data_juicer.ops.mapper.CalibrateResponseMapper method) (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.ExtractEventMapper method) (data_juicer.ops.mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.optimize_query_mapper.OptimizeQueryMapper method) (data_juicer.ops.mapper.optimize_response_mapper.OptimizeResponseMapper method) (data_juicer.ops.mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.OptimizeQueryMapper method) (data_juicer.ops.mapper.OptimizeResponseMapper method) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method) (data_juicer.ops.mapper.RelationIdentityMapper method) parse_string_to_roi() (in module data_juicer.utils.mm_utils) patch_nltk_pickle_security() (in module data_juicer.utils.nltk_utils) perplexity (data_juicer.utils.constant.StatsKeysConstant attribute) PerplexityFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.perplexity_filter) phrase_grounding_recall (data_juicer.utils.constant.StatsKeysConstant attribute) PhraseGroundingRecallFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.phrase_grounding_recall_filter) pil_to_opencv() (in module data_juicer.utils.mm_utils) prepare_api_model() (in module data_juicer.utils.model_utils) prepare_converter() (in module data_juicer.ops.mapper.chinese_convert_mapper) prepare_diffusion_model() (in module data_juicer.utils.model_utils) prepare_fastsam_model() (in module data_juicer.utils.model_utils) prepare_fasttext_model() (in module data_juicer.utils.model_utils) prepare_huggingface_model() (in module data_juicer.utils.model_utils) prepare_kenlm_model() (in module data_juicer.utils.model_utils) prepare_model() (in module data_juicer.utils.model_utils) prepare_nltk_model() (in module data_juicer.utils.model_utils) prepare_nltk_pos_tagger() (in module data_juicer.utils.model_utils) prepare_opencv_classifier() (in module data_juicer.utils.model_utils) prepare_recognizeAnything_model() (in module data_juicer.utils.model_utils) prepare_sdxl_prompt2prompt() (in module data_juicer.utils.model_utils) prepare_sentencepiece_for_lang() (in module data_juicer.utils.model_utils) prepare_sentencepiece_model() (in module data_juicer.utils.model_utils) prepare_side_configs() (in module data_juicer.config) (in module data_juicer.config.config) prepare_simple_aesthetics_model() (in module data_juicer.utils.model_utils) prepare_spacy_model() (in module data_juicer.utils.model_utils) prepare_video_blip_model() (in module data_juicer.utils.model_utils) prepare_vllm_model() (in module data_juicer.utils.model_utils) preprocess_dataset() (in module data_juicer.core.data.ray_dataset) probe_small_batch() (data_juicer.core.Adapter method) (data_juicer.core.adapter.Adapter method) process() (data_juicer.core.data.dj_dataset.DJDataset method) (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.DJDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.data.ray_dataset.RayDataset method) (data_juicer.core.NestedDataset method) (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.Deduplicator method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) (data_juicer.ops.Grouper method) (data_juicer.ops.grouper.key_value_grouper.KeyValueGrouper method) (data_juicer.ops.grouper.KeyValueGrouper method) (data_juicer.ops.grouper.naive_grouper.NaiveGrouper method) (data_juicer.ops.grouper.naive_reverse_grouper.NaiveReverseGrouper method) (data_juicer.ops.grouper.NaiveGrouper method) (data_juicer.ops.grouper.NaiveReverseGrouper method) (data_juicer.ops.Selector method) (data_juicer.ops.selector.frequency_specified_field_selector.FrequencySpecifiedFieldSelector method) (data_juicer.ops.selector.FrequencySpecifiedFieldSelector method) (data_juicer.ops.selector.random_selector.RandomSelector method) (data_juicer.ops.selector.RandomSelector method) (data_juicer.ops.selector.range_specified_field_selector.RangeSpecifiedFieldSelector method) (data_juicer.ops.selector.RangeSpecifiedFieldSelector method) (data_juicer.ops.selector.tags_specified_field_selector.TagsSpecifiedFieldSelector method) (data_juicer.ops.selector.TagsSpecifiedFieldSelector method) (data_juicer.ops.selector.topk_specified_field_selector.TopkSpecifiedFieldSelector method) (data_juicer.ops.selector.TopkSpecifiedFieldSelector method) process_batched() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.Filter method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.filter.AlphanumericFilter method) (data_juicer.ops.filter.average_line_length_filter.AverageLineLengthFilter method) (data_juicer.ops.filter.AverageLineLengthFilter method) (data_juicer.ops.filter.character_repetition_filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.CharacterRepetitionFilter method) (data_juicer.ops.filter.flagged_words_filter.FlaggedWordFilter method) (data_juicer.ops.filter.FlaggedWordFilter method) (data_juicer.ops.filter.image_aspect_ratio_filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.ImageAspectRatioFilter method) (data_juicer.ops.filter.maximum_line_length_filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.MaximumLineLengthFilter method) (data_juicer.ops.filter.perplexity_filter.PerplexityFilter method) (data_juicer.ops.filter.PerplexityFilter method) (data_juicer.ops.filter.special_characters_filter.SpecialCharactersFilter method) (data_juicer.ops.filter.SpecialCharactersFilter method) (data_juicer.ops.filter.text_length_filter.TextLengthFilter method) (data_juicer.ops.filter.TextLengthFilter method) (data_juicer.ops.filter.word_repetition_filter.WordRepetitionFilter method) (data_juicer.ops.filter.WordRepetitionFilter method) (data_juicer.ops.filter.words_num_filter.WordsNumFilter method) (data_juicer.ops.filter.WordsNumFilter method) (data_juicer.ops.Mapper method) (data_juicer.ops.mapper.annotation.annotation_mapper.BaseAnnotationMapper method) (data_juicer.ops.mapper.chinese_convert_mapper.ChineseConvertMapper method) (data_juicer.ops.mapper.ChineseConvertMapper method) (data_juicer.ops.mapper.clean_copyright_mapper.CleanCopyrightMapper method) (data_juicer.ops.mapper.clean_email_mapper.CleanEmailMapper method) (data_juicer.ops.mapper.clean_html_mapper.CleanHtmlMapper method) (data_juicer.ops.mapper.clean_ip_mapper.CleanIpMapper method) (data_juicer.ops.mapper.clean_links_mapper.CleanLinksMapper method) (data_juicer.ops.mapper.CleanCopyrightMapper method) (data_juicer.ops.mapper.CleanEmailMapper method) (data_juicer.ops.mapper.CleanHtmlMapper method) (data_juicer.ops.mapper.CleanIpMapper method) (data_juicer.ops.mapper.CleanLinksMapper method) (data_juicer.ops.mapper.expand_macro_mapper.ExpandMacroMapper method) (data_juicer.ops.mapper.extract_event_mapper.ExtractEventMapper method) (data_juicer.ops.mapper.ExtractEventMapper method) (data_juicer.ops.mapper.fix_unicode_mapper.FixUnicodeMapper method) (data_juicer.ops.mapper.FixUnicodeMapper method) (data_juicer.ops.mapper.generate_qa_from_text_mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.GenerateQAFromTextMapper method) (data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper.ImageCaptioningFromGPT4VMapper method) (data_juicer.ops.mapper.image_captioning_mapper.ImageCaptioningMapper method) (data_juicer.ops.mapper.image_diffusion_mapper.ImageDiffusionMapper method) (data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method) (data_juicer.ops.mapper.ImageCaptioningMapper method) (data_juicer.ops.mapper.ImageDiffusionMapper method) (data_juicer.ops.mapper.nlpaug_en_mapper.NlpaugEnMapper method) (data_juicer.ops.mapper.NlpaugEnMapper method) (data_juicer.ops.mapper.nlpcda_zh_mapper.NlpcdaZhMapper method) (data_juicer.ops.mapper.NlpcdaZhMapper method) (data_juicer.ops.mapper.punctuation_normalization_mapper.PunctuationNormalizationMapper method) (data_juicer.ops.mapper.PunctuationNormalizationMapper method) (data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method) (data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.PythonFileMapper method) (data_juicer.ops.mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.query_intent_detection_mapper.QueryIntentDetectionMapper method) (data_juicer.ops.mapper.query_sentiment_detection_mapper.QuerySentimentDetectionMapper method) (data_juicer.ops.mapper.query_topic_detection_mapper.QueryTopicDetectionMapper method) (data_juicer.ops.mapper.QueryIntentDetectionMapper method) (data_juicer.ops.mapper.QuerySentimentDetectionMapper method) (data_juicer.ops.mapper.QueryTopicDetectionMapper method) (data_juicer.ops.mapper.remove_bibliography_mapper.RemoveBibliographyMapper method) (data_juicer.ops.mapper.remove_comments_mapper.RemoveCommentsMapper method) (data_juicer.ops.mapper.remove_header_mapper.RemoveHeaderMapper method) (data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method) (data_juicer.ops.mapper.remove_non_chinese_character_mapper.RemoveNonChineseCharacterlMapper method) (data_juicer.ops.mapper.remove_repeat_sentences_mapper.RemoveRepeatSentencesMapper method) (data_juicer.ops.mapper.remove_specific_chars_mapper.RemoveSpecificCharsMapper method) (data_juicer.ops.mapper.remove_table_text_mapper.RemoveTableTextMapper method) (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method) (data_juicer.ops.mapper.RemoveBibliographyMapper method) (data_juicer.ops.mapper.RemoveCommentsMapper method) (data_juicer.ops.mapper.RemoveHeaderMapper method) (data_juicer.ops.mapper.RemoveLongWordsMapper method) (data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method) (data_juicer.ops.mapper.RemoveRepeatSentencesMapper method) (data_juicer.ops.mapper.RemoveSpecificCharsMapper method) (data_juicer.ops.mapper.RemoveTableTextMapper method) (data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method) (data_juicer.ops.mapper.replace_content_mapper.ReplaceContentMapper method) (data_juicer.ops.mapper.ReplaceContentMapper method) (data_juicer.ops.mapper.sentence_split_mapper.SentenceSplitMapper method) (data_juicer.ops.mapper.SentenceSplitMapper method) (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method) (data_juicer.ops.mapper.TextChunkMapper method) (data_juicer.ops.mapper.video_captioning_from_audio_mapper.VideoCaptioningFromAudioMapper method) (data_juicer.ops.mapper.video_captioning_from_frames_mapper.VideoCaptioningFromFramesMapper method) (data_juicer.ops.mapper.video_captioning_from_summarizer_mapper.VideoCaptioningFromSummarizerMapper method) (data_juicer.ops.mapper.video_captioning_from_video_mapper.VideoCaptioningFromVideoMapper method) (data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method) (data_juicer.ops.mapper.video_split_by_key_frame_mapper.VideoSplitByKeyFrameMapper method) (data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method) (data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method) (data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method) (data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method) (data_juicer.ops.mapper.VideoSplitByDurationMapper method) (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method) (data_juicer.ops.mapper.whitespace_normalization_mapper.WhitespaceNormalizationMapper method) (data_juicer.ops.mapper.WhitespaceNormalizationMapper method) (data_juicer.ops.op_fusion.FusedFilter method) (data_juicer.ops.op_fusion.GeneralFusedOP method) process_each_frame() (in module data_juicer.utils.mm_utils) process_single() (data_juicer.ops.Aggregator method) (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.Filter method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) (data_juicer.ops.filter.audio_nmf_snr_filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.audio_size_filter.AudioSizeFilter method) (data_juicer.ops.filter.AudioDurationFilter method) (data_juicer.ops.filter.AudioNMFSNRFilter method) (data_juicer.ops.filter.AudioSizeFilter method) (data_juicer.ops.filter.general_field_filter.GeneralFieldFilter method) (data_juicer.ops.filter.GeneralFieldFilter method) (data_juicer.ops.filter.image_aesthetics_filter.ImageAestheticsFilter method) (data_juicer.ops.filter.image_face_count_filter.ImageFaceCountFilter method) (data_juicer.ops.filter.image_face_ratio_filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.image_nsfw_filter.ImageNSFWFilter method) (data_juicer.ops.filter.image_pair_similarity_filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.image_shape_filter.ImageShapeFilter method) (data_juicer.ops.filter.image_size_filter.ImageSizeFilter method) (data_juicer.ops.filter.image_text_matching_filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.image_text_similarity_filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.image_watermark_filter.ImageWatermarkFilter method) (data_juicer.ops.filter.ImageAestheticsFilter method) (data_juicer.ops.filter.ImageFaceCountFilter method) (data_juicer.ops.filter.ImageFaceRatioFilter method) (data_juicer.ops.filter.ImageNSFWFilter method) (data_juicer.ops.filter.ImagePairSimilarityFilter method) (data_juicer.ops.filter.ImageShapeFilter method) (data_juicer.ops.filter.ImageSizeFilter method) (data_juicer.ops.filter.ImageTextMatchingFilter method) (data_juicer.ops.filter.ImageTextSimilarityFilter method) (data_juicer.ops.filter.ImageWatermarkFilter method) (data_juicer.ops.filter.language_id_score_filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.LanguageIDScoreFilter method) (data_juicer.ops.filter.llm_difficulty_score_filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.llm_quality_score_filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.LLMDifficultyScoreFilter method) (data_juicer.ops.filter.LLMQualityScoreFilter method) (data_juicer.ops.filter.phrase_grounding_recall_filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.PhraseGroundingRecallFilter method) (data_juicer.ops.filter.specified_field_filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.specified_numeric_field_filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.SpecifiedFieldFilter method) (data_juicer.ops.filter.SpecifiedNumericFieldFilter method) (data_juicer.ops.filter.stopwords_filter.StopWordsFilter method) (data_juicer.ops.filter.StopWordsFilter method) (data_juicer.ops.filter.suffix_filter.SuffixFilter method) (data_juicer.ops.filter.SuffixFilter method) (data_juicer.ops.filter.text_action_filter.TextActionFilter method) (data_juicer.ops.filter.text_entity_dependency_filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.text_pair_similarity_filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.TextActionFilter method) (data_juicer.ops.filter.TextEntityDependencyFilter method) (data_juicer.ops.filter.TextPairSimilarityFilter method) (data_juicer.ops.filter.token_num_filter.TokenNumFilter method) (data_juicer.ops.filter.TokenNumFilter method) (data_juicer.ops.filter.video_aesthetics_filter.VideoAestheticsFilter method) (data_juicer.ops.filter.video_aspect_ratio_filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.video_duration_filter.VideoDurationFilter method) (data_juicer.ops.filter.video_frames_text_similarity_filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.video_nsfw_filter.VideoNSFWFilter method) (data_juicer.ops.filter.video_ocr_area_ratio_filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.video_resolution_filter.VideoResolutionFilter method) (data_juicer.ops.filter.video_tagging_from_frames_filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.video_watermark_filter.VideoWatermarkFilter method) (data_juicer.ops.filter.VideoAestheticsFilter method) (data_juicer.ops.filter.VideoAspectRatioFilter method) (data_juicer.ops.filter.VideoDurationFilter method) (data_juicer.ops.filter.VideoFramesTextSimilarityFilter method) (data_juicer.ops.filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.VideoNSFWFilter method) (data_juicer.ops.filter.VideoOcrAreaRatioFilter method) (data_juicer.ops.filter.VideoResolutionFilter method) (data_juicer.ops.filter.VideoTaggingFromFramesFilter method) (data_juicer.ops.filter.VideoWatermarkFilter method) (data_juicer.ops.Mapper method) (data_juicer.ops.mapper.audio_add_gaussian_noise_mapper.AudioAddGaussianNoiseMapper method) (data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper.AudioFFmpegWrappedMapper method) (data_juicer.ops.mapper.AudioAddGaussianNoiseMapper method) (data_juicer.ops.mapper.AudioFFmpegWrappedMapper method) (data_juicer.ops.mapper.calibrate_qa_mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.CalibrateQAMapper method) (data_juicer.ops.mapper.dialog_intent_detection_mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_detection_mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.dialog_sentiment_intensity_mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.dialog_topic_detection_mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.DialogIntentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentDetectionMapper method) (data_juicer.ops.mapper.DialogSentimentIntensityMapper method) (data_juicer.ops.mapper.DialogTopicDetectionMapper method) (data_juicer.ops.mapper.Difference_Area_Generator_Mapper method) (data_juicer.ops.mapper.extract_entity_attribute_mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.extract_entity_relation_mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.extract_keyword_mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.extract_nickname_mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.extract_support_text_mapper.ExtractSupportTextMapper method) (data_juicer.ops.mapper.extract_tables_from_html_mapper.ExtractTablesFromHtmlMapper method) (data_juicer.ops.mapper.ExtractEntityAttributeMapper method) (data_juicer.ops.mapper.ExtractEntityRelationMapper method) (data_juicer.ops.mapper.ExtractKeywordMapper method) (data_juicer.ops.mapper.ExtractNicknameMapper method) (data_juicer.ops.mapper.ExtractSupportTextMapper method) (data_juicer.ops.mapper.ExtractTablesFromHtmlMapper method) (data_juicer.ops.mapper.generate_qa_from_examples_mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method) (data_juicer.ops.mapper.image_blur_mapper.ImageBlurMapper method) (data_juicer.ops.mapper.image_face_blur_mapper.ImageFaceBlurMapper method) (data_juicer.ops.mapper.image_remove_background_mapper.ImageRemoveBackgroundMapper method) (data_juicer.ops.mapper.image_segment_mapper.ImageSegmentMapper method) (data_juicer.ops.mapper.image_tagging_mapper.ImageTaggingMapper method) (data_juicer.ops.mapper.ImageBlurMapper method) (data_juicer.ops.mapper.ImageFaceBlurMapper method) (data_juicer.ops.mapper.ImageRemoveBackgroundMapper method) (data_juicer.ops.mapper.ImageSegmentMapper method) (data_juicer.ops.mapper.ImageTaggingMapper method) (data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper.Difference_Area_Generator_Mapper method) (data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper.Difference_Caption_Generator_Mapper method) (data_juicer.ops.mapper.mllm_mapper.MllmMapper method) (data_juicer.ops.mapper.MllmMapper method) (data_juicer.ops.mapper.optimize_qa_mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.OptimizeQAMapper method) (data_juicer.ops.mapper.pair_preference_mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.PairPreferenceMapper method) (data_juicer.ops.mapper.python_file_mapper.PythonFileMapper method) (data_juicer.ops.mapper.python_lambda_mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.PythonFileMapper method) (data_juicer.ops.mapper.PythonLambdaMapper method) (data_juicer.ops.mapper.relation_identity_mapper.RelationIdentityMapper method) (data_juicer.ops.mapper.RelationIdentityMapper method) (data_juicer.ops.mapper.sdxl_prompt2prompt_mapper.SDXLPrompt2PromptMapper method) (data_juicer.ops.mapper.SDXLPrompt2PromptMapper method) (data_juicer.ops.mapper.sentence_augmentation_mapper.SentenceAugmentationMapper method) (data_juicer.ops.mapper.SentenceAugmentationMapper method) (data_juicer.ops.mapper.video_extract_frames_mapper.VideoExtractFramesMapper method) (data_juicer.ops.mapper.video_face_blur_mapper.VideoFaceBlurMapper method) (data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper.VideoFFmpegWrappedMapper method) (data_juicer.ops.mapper.video_remove_watermark_mapper.VideoRemoveWatermarkMapper method) (data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper method) (data_juicer.ops.mapper.video_resize_resolution_mapper.VideoResizeResolutionMapper method) (data_juicer.ops.mapper.video_split_by_scene_mapper.VideoSplitBySceneMapper method) (data_juicer.ops.mapper.video_tagging_from_audio_mapper.VideoTaggingFromAudioMapper method) (data_juicer.ops.mapper.video_tagging_from_frames_mapper.VideoTaggingFromFramesMapper method) (data_juicer.ops.mapper.VideoExtractFramesMapper method) (data_juicer.ops.mapper.VideoFaceBlurMapper method) (data_juicer.ops.mapper.VideoFFmpegWrappedMapper method) (data_juicer.ops.mapper.VideoRemoveWatermarkMapper method) (data_juicer.ops.mapper.VideoResizeAspectRatioMapper method) (data_juicer.ops.mapper.VideoResizeResolutionMapper method) (data_juicer.ops.mapper.VideoSplitBySceneMapper method) (data_juicer.ops.mapper.VideoTaggingFromAudioMapper method) (data_juicer.ops.mapper.VideoTaggingFromFramesMapper method) Prompt2PromptPipeline (class in data_juicer.ops.common.prompt2prompt_pipeline) PunctuationNormalizationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.punctuation_normalization_mapper) PythonFileMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.python_file_mapper) PythonLambdaMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.python_lambda_mapper) Q query_cuda_info() (in module data_juicer.utils.resource_utils) query_intent_label (data_juicer.utils.constant.MetaKeys attribute) query_intent_score (data_juicer.utils.constant.MetaKeys attribute) query_mem_info() (in module data_juicer.utils.resource_utils) query_most_relevant_entities() (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) query_sentiment_label (data_juicer.utils.constant.MetaKeys attribute) query_sentiment_score (data_juicer.utils.constant.MetaKeys attribute) query_topic_label (data_juicer.utils.constant.MetaKeys attribute) query_topic_score (data_juicer.utils.constant.MetaKeys attribute) QueryIntentDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.query_intent_detection_mapper) QuerySentimentDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.query_sentiment_detection_mapper) QueryTopicDetectionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.query_topic_detection_mapper) R random_sample() (in module data_juicer.utils.sample) RandomSelector (class in data_juicer.ops.selector) (class in data_juicer.ops.selector.random_selector) RangeSpecifiedFieldSelector (class in data_juicer.ops.selector) (class in data_juicer.ops.selector.range_specified_field_selector) RayBasicDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) RayBTSMinhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) RayDataLoadStrategy (class in data_juicer.core.data.load_strategy) RayDataset (class in data_juicer.core.data.ray_dataset) RayDocumentDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_document_deduplicator) RayEmptyFormatter (class in data_juicer.format) (class in data_juicer.format.empty_formatter) RayExecutor (class in data_juicer.core.executor.ray_executor) RayHuggingfaceDataLoadStrategy (class in data_juicer.core.data.load_strategy) RayImageDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_image_deduplicator) RayLocalJsonDataLoadStrategy (class in data_juicer.core.data.load_strategy) RayVideoDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_video_deduplicator) read_json() (data_juicer.core.data.ray_dataset.RayDataset class method) read_json_stream() (in module data_juicer.core.data.ray_dataset) read_single_partition() (in module data_juicer.utils.file_utils) rebalancing() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) record() (data_juicer.utils.ckpt_utils.CheckpointManager method) recursive_summary() (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) recursively_chunk() (data_juicer.ops.mapper.text_chunk_mapper.TextChunkMapper method) (data_juicer.ops.mapper.TextChunkMapper method) redirect_sys_output() (in module data_juicer.utils.logger_utils) RedisBackend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) refine_single_column() (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) refined_words (data_juicer.utils.constant.InterVars attribute) register() (data_juicer.core.data.data_validator.DataValidatorRegistry class method) (data_juicer.core.data.load_strategy.DataLoadStrategyRegistry class method) register_attention_control() (data_juicer.ops.common.prompt2prompt_pipeline.Prompt2PromptPipeline method) register_event_handler() (data_juicer.ops.mixins.EventDrivenMixin method) register_module() (data_juicer.utils.registry.Registry method) Registry (class in data_juicer.utils.registry) RelatedTTestMeasure (class in data_juicer.analysis.measure) relation (data_juicer.utils.constant.MetaKeys attribute) relation_description (data_juicer.utils.constant.MetaKeys attribute) relation_keywords (data_juicer.utils.constant.MetaKeys attribute) relation_strength (data_juicer.utils.constant.MetaKeys attribute) RelationIdentityMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.relation_identity_mapper) relevant_characters (data_juicer.utils.constant.MetaKeys attribute) RemoteFormatter (class in data_juicer.format) (class in data_juicer.format.formatter) remove_columns() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) remove_extra_parameters() (data_juicer.ops.base_op.OP method) remove_non_special_tokens() (in module data_juicer.utils.mm_utils) remove_punctuation() (in module data_juicer.ops.filter.phrase_grounding_recall_filter) remove_special_tokens() (in module data_juicer.utils.mm_utils) RemoveBibliographyMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_bibliography_mapper) RemoveCommentsMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_comments_mapper) RemoveHeaderMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_header_mapper) RemoveLongWordsMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_long_words_mapper) RemoveNonChineseCharacterlMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_non_chinese_character_mapper) RemoveRepeatSentencesMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_repeat_sentences_mapper) RemoveSpecificCharsMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_specific_chars_mapper) RemoveTableTextMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_table_text_mapper) RemoveWordsWithIncorrectSubstringsMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper) replace_cross_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionRefine method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReplace method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReweight method) replace_func() (in module data_juicer.ops.mapper.video_split_by_scene_mapper) replace_self_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) ReplaceContentMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.replace_content_mapper) RequiredFieldsValidator (class in data_juicer.core.data.data_validator) rescale() (in module data_juicer.ops.mapper.video_resize_aspect_ratio_mapper) rescale_noise_cfg() (in module data_juicer.ops.common.prompt2prompt_pipeline) reset() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) reset_dependencies_cache() (data_juicer.utils.lazy_loader.LazyLoader class method) resource_monitor() (in module data_juicer.core.monitor) rewrite_cli_datapath() (in module data_juicer.core.data.dataset_builder) role_relation (data_juicer.utils.constant.MetaKeys attribute) run() (data_juicer.core.Analyzer method) (data_juicer.core.analyzer.Analyzer method) (data_juicer.core.DefaultExecutor method) (data_juicer.core.executor.base.ExecutorBase method) (data_juicer.core.executor.default_executor.DefaultExecutor method) (data_juicer.core.executor.DefaultExecutor method) (data_juicer.core.executor.ExecutorBase method) (data_juicer.core.executor.ray_executor.RayExecutor method) (data_juicer.core.ExecutorBase method) (data_juicer.ops.Aggregator method) (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.Deduplicator method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.Filter method) (data_juicer.ops.Grouper method) (data_juicer.ops.Mapper method) (data_juicer.ops.op_fusion.GeneralFusedOP method) (data_juicer.ops.Selector method) run_ner() (in module data_juicer.ops.filter.phrase_grounding_recall_filter) run_single_op() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase method) runtime_np() (data_juicer.ops.base_op.OP method) S sample_data() (data_juicer.core.DefaultExecutor method) (data_juicer.core.executor.default_executor.DefaultExecutor method) (data_juicer.core.executor.DefaultExecutor method) sampled_frames (data_juicer.utils.constant.InterVars attribute) save_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) Schema (class in data_juicer.core.data.schema) schema() (data_juicer.core.data.dj_dataset.DJDataset method) (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.DJDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.data.ray_dataset.RayDataset method) (data_juicer.core.NestedDataset method) ScoreParams (class in data_juicer.ops.common.prompt2prompt_pipeline) SDXLPrompt2PromptMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.sdxl_prompt2prompt_mapper) select() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) select_columns() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) Selector (class in data_juicer.ops) (class in data_juicer.ops.base_op) send_notification() (data_juicer.ops.mixins.NotificationMixin method) SentenceAugmentationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.sentence_augmentation_mapper) SentenceSplitMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.sentence_split_mapper) separate_signal_noise() (in module data_juicer.ops.filter.audio_nmf_snr_filter) set_clear_model_flag() (in module data_juicer.utils.unittest_utils) set_dataset_to_absolute_path() (in module data_juicer.core.data.ray_dataset) set_edge_buffer() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) set_edges() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) setup_logger() (in module data_juicer.utils.logger_utils) setup_model() (data_juicer.ops.filter.video_motion_score_filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.video_motion_score_raft_filter.VideoMotionScoreRaftFilter method) (data_juicer.ops.filter.VideoMotionScoreFilter method) (data_juicer.ops.filter.VideoMotionScoreRaftFilter method) setup_mp() (in module data_juicer.utils.process_utils) setup_project() (data_juicer.ops.mapper.annotation.annotation_mapper.LabelStudioAnnotationMapper method) setup_resource_aliases() (in module data_juicer.utils.nltk_utils) setUpClass() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method) sha1_hash32() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator) should_keep_long_word() (data_juicer.ops.mapper.remove_long_words_mapper.RemoveLongWordsMapper method) (data_juicer.ops.mapper.RemoveLongWordsMapper method) should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper.RemoveWordsWithIncorrectSubstringsMapper method) (data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method) simhash (data_juicer.utils.constant.HashKeys attribute) single_partition_write_with_filename() (in module data_juicer.utils.file_utils) size_to_bytes() (in module data_juicer.utils.mm_utils) sort_op_by_types_and_names() (in module data_juicer.config.config) source_entity (data_juicer.utils.constant.MetaKeys attribute) source_file (data_juicer.utils.constant.Fields attribute) special_char_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) SpecialCharactersFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.special_characters_filter) SpecialTokens (class in data_juicer.utils.mm_utils) SpecifiedFieldFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.specified_field_filter) SpecifiedNumericFieldFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.specified_numeric_field_filter) split_on_newline_tab_whitespace() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) split_on_whitespace() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) split_sentence() (in module data_juicer.ops.mapper.remove_repeat_sentences_mapper) split_text_by_punctuation() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) split_videos_by_duration() (data_juicer.ops.mapper.video_split_by_duration_mapper.VideoSplitByDurationMapper method) (data_juicer.ops.mapper.VideoSplitByDurationMapper method) squeeze() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) start_polling() (data_juicer.ops.mixins.EventDrivenMixin method) stats (data_juicer.utils.constant.Fields attribute) stats_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method) stats_to_number() (in module data_juicer.utils.common_utils) StatsKeys (class in data_juicer.utils.constant) StatsKeysConstant (class in data_juicer.utils.constant) StatsKeysMeta (class in data_juicer.utils.constant) step_callback() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) stop_all_polling() (data_juicer.ops.mixins.EventDrivenMixin method) stop_polling() (data_juicer.ops.mixins.EventDrivenMixin method) stopwords_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) StopWordsFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.stopwords_filter) STRATEGY (data_juicer.ops.mapper.video_resize_aspect_ratio_mapper.VideoResizeAspectRatioMapper attribute) (data_juicer.ops.mapper.VideoResizeAspectRatioMapper attribute) StrategyKey (class in data_juicer.core.data.load_strategy) StreamToLoguru (class in data_juicer.utils.logger_utils) strip() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) suffix (data_juicer.utils.constant.Fields attribute) SUFFIXES (data_juicer.format.csv_formatter.CsvFormatter attribute) (data_juicer.format.CsvFormatter attribute) (data_juicer.format.empty_formatter.EmptyFormatter attribute) (data_juicer.format.empty_formatter.RayEmptyFormatter attribute) (data_juicer.format.EmptyFormatter attribute) (data_juicer.format.json_formatter.JsonFormatter attribute) (data_juicer.format.JsonFormatter attribute) (data_juicer.format.parquet_formatter.ParquetFormatter attribute) (data_juicer.format.ParquetFormatter attribute) (data_juicer.format.RayEmptyFormatter attribute) (data_juicer.format.text_formatter.TextFormatter attribute) (data_juicer.format.TextFormatter attribute) (data_juicer.format.tsv_formatter.TsvFormatter attribute) (data_juicer.format.TsvFormatter attribute) SuffixFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.suffix_filter) support_text (data_juicer.utils.constant.MetaKeys attribute) SwiftMessagesValidator (class in data_juicer.core.data.data_validator) T TagsSpecifiedFieldSelector (class in data_juicer.ops.selector) (class in data_juicer.ops.selector.tags_specified_field_selector) take_batch() (data_juicer.core.Adapter static method) (data_juicer.core.adapter.Adapter static method) target_entity (data_juicer.utils.constant.MetaKeys attribute) tearDown() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method) tearDownClass() (data_juicer.utils.unittest_utils.DataJuicerTestCaseBase class method) TempDirManager (class in data_juicer.core.executor.ray_executor) TEST_TAG() (in module data_juicer.utils.unittest_utils) text_len (data_juicer.utils.constant.StatsKeysConstant attribute) text_pair_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) TextActionFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.text_action_filter) TextChunkMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.text_chunk_mapper) TextEntityDependencyFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.text_entity_dependency_filter) TextFormatter (class in data_juicer.format) (class in data_juicer.format.text_formatter) TextLengthFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.text_length_filter) TextPairSimilarityFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.text_pair_similarity_filter) TextTokenDistCollector (class in data_juicer.analysis.collector) TiB (data_juicer.core.Exporter attribute) (data_juicer.core.exporter.Exporter attribute) timecode_string_to_seconds() (in module data_juicer.utils.mm_utils) to_json() (data_juicer.core.Exporter static method) (data_juicer.core.exporter.Exporter static method) to_jsonl() (data_juicer.core.Exporter static method) (data_juicer.core.exporter.Exporter static method) to_parquet() (data_juicer.core.Exporter static method) (data_juicer.core.exporter.Exporter static method) TokenNumFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.token_num_filter) TopkSpecifiedFieldSelector (class in data_juicer.ops.selector) (class in data_juicer.ops.selector.topk_specified_field_selector) trace_batch_mapper() (data_juicer.core.Tracer method) (data_juicer.core.tracer.Tracer method) trace_deduplicator() (data_juicer.core.Tracer method) (data_juicer.core.tracer.Tracer method) trace_filter() (data_juicer.core.Tracer method) (data_juicer.core.tracer.Tracer method) trace_mapper() (data_juicer.core.Tracer method) (data_juicer.core.tracer.Tracer method) Tracer (class in data_juicer.core) (class in data_juicer.core.tracer) transfer_data_dir() (in module data_juicer.utils.file_utils) transfer_filename() (in module data_juicer.utils.file_utils) transform() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) triangle_area() (in module data_juicer.ops.filter.video_ocr_area_ratio_filter) trigger_event() (data_juicer.ops.mixins.EventDrivenMixin method) TsvFormatter (class in data_juicer.format) (class in data_juicer.format.tsv_formatter) U uid (data_juicer.utils.constant.HashKeys attribute) unify_format() (in module data_juicer.format.formatter) union() (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) union_list() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) UnionFind (class in data_juicer.ops.common.helper_func) update() (data_juicer.utils.fingerprint_utils.Hasher method) update_alpha_time_word() (in module data_juicer.ops.common.prompt2prompt_pipeline) update_args() (data_juicer.core.data.dj_dataset.NestedDataset method) (data_juicer.core.data.NestedDataset method) (data_juicer.core.NestedDataset method) update_ds_cache_dir_and_related_vars() (in module data_juicer.config.config) update_fingerprint() (in module data_juicer.utils.fingerprint_utils) update_op_attr() (in module data_juicer.config.config) update_op_process() (in module data_juicer.config.config) update_sampling_params() (in module data_juicer.utils.model_utils) use_cuda() (data_juicer.ops.base_op.OP method) V validate() (data_juicer.core.data.data_validator.BaseConversationValidator method) (data_juicer.core.data.data_validator.CodeDataValidator method) (data_juicer.core.data.data_validator.DataValidator method) (data_juicer.core.data.data_validator.RequiredFieldsValidator method) validate_config() (data_juicer.core.data.config_validator.ConfigValidator method) validate_conversation() (data_juicer.core.data.data_validator.BaseConversationValidator method) (data_juicer.core.data.data_validator.DataJuicerFormatValidator method) (data_juicer.core.data.data_validator.SwiftMessagesValidator method) validate_snapshot_format() (in module data_juicer.download.downloader) video (data_juicer.utils.mm_utils.SpecialTokens attribute) video_aesthetic_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) video_audio_tags (data_juicer.utils.constant.MetaKeys attribute) video_duration (data_juicer.utils.constant.StatsKeysConstant attribute) video_frame_tags (data_juicer.utils.constant.MetaKeys attribute) video_frames (data_juicer.utils.constant.MetaKeys attribute) video_frames_aesthetics_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_frames_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) video_height (data_juicer.utils.constant.StatsKeysConstant attribute) video_motion_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_ocr_area_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) video_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute) video_width (data_juicer.utils.constant.StatsKeysConstant attribute) VideoAestheticsFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_aesthetics_filter) VideoAspectRatioFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_aspect_ratio_filter) VideoCaptioningFromAudioMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_captioning_from_audio_mapper) VideoCaptioningFromFramesMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_captioning_from_frames_mapper) VideoCaptioningFromSummarizerMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_captioning_from_summarizer_mapper) VideoCaptioningFromVideoMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_captioning_from_video_mapper) VideoCapture() (in module data_juicer.ops.filter.video_motion_score_filter) VideoDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.video_deduplicator) VideoDurationFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_duration_filter) VideoExtractFramesMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_extract_frames_mapper) VideoFaceBlurMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_face_blur_mapper) VideoFFmpegWrappedMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper) VideoFramesTextSimilarityFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_frames_text_similarity_filter) videohash (data_juicer.utils.constant.HashKeys attribute) VideoMotionScoreFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_motion_score_filter) VideoMotionScoreRaftFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_motion_score_raft_filter) VideoNSFWFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_nsfw_filter) VideoOcrAreaRatioFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_ocr_area_ratio_filter) VideoRemoveWatermarkMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_remove_watermark_mapper) VideoResizeAspectRatioMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_resize_aspect_ratio_mapper) VideoResizeResolutionMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_resize_resolution_mapper) VideoResolutionFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_resolution_filter) VideoSplitByDurationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_split_by_duration_mapper) VideoSplitByKeyFrameMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_split_by_key_frame_mapper) VideoSplitBySceneMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_split_by_scene_mapper) VideoTaggingFromAudioMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_tagging_from_audio_mapper) VideoTaggingFromFramesFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_tagging_from_frames_filter) VideoTaggingFromFramesMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.video_tagging_from_frames_mapper) VideoWatermarkFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.video_watermark_filter) visit_Attribute() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) visit_BoolOp() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) visit_Compare() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) visit_Constant() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) visit_Name() (data_juicer.ops.filter.general_field_filter.ExpressionTransformer method) W wait_for_completion() (data_juicer.ops.mixins.EventDrivenMixin method) WhitespaceNormalizationMapper (class in data_juicer.ops.mapper) (class in data_juicer.ops.mapper.whitespace_normalization_mapper) WikipediaDownloader (class in data_juicer.download.wikipedia) WikipediaExtractor (class in data_juicer.download.wikipedia) WikipediaIterator (class in data_juicer.download.wikipedia) word_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) WordRepetitionFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.word_repetition_filter) words (data_juicer.utils.constant.InterVars attribute) words_augmentation() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) words_refinement() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) WordsNumFilter (class in data_juicer.ops.filter) (class in data_juicer.ops.filter.words_num_filter) wrap_func_with_nested_access() (in module data_juicer.core.data) (in module data_juicer.core.data.dj_dataset) write() (data_juicer.utils.logger_utils.StreamToLoguru method) Z ZstdCompressor (class in data_juicer.utils.compress)