All modules for which code is available
- data_juicer
- data_juicer.analysis.column_wise_analysis
- data_juicer.analysis.diversity_analysis
- data_juicer.analysis.overall_analysis
- data_juicer.config.config
- data_juicer.core.adapter
- data_juicer.core.analyzer
- data_juicer.core.data
- data_juicer.core.executor
- data_juicer.core.exporter
- data_juicer.core.monitor
- data_juicer.core.tracer
- data_juicer.format.csv_formatter
- data_juicer.format.empty_formatter
- data_juicer.format.formatter
- data_juicer.format.json_formatter
- data_juicer.format.load
- data_juicer.format.mixture_formatter
- data_juicer.format.parquet_formatter
- data_juicer.format.text_formatter
- data_juicer.format.tsv_formatter
- data_juicer.ops.base_op
- data_juicer.ops.common.helper_func
- data_juicer.ops.deduplicator.document_deduplicator
- data_juicer.ops.deduplicator.document_minhash_deduplicator
- data_juicer.ops.deduplicator.document_simhash_deduplicator
- data_juicer.ops.deduplicator.image_deduplicator
- data_juicer.ops.deduplicator.ray_basic_deduplicator
- data_juicer.ops.deduplicator.ray_document_deduplicator
- data_juicer.ops.deduplicator.ray_image_deduplicator
- data_juicer.ops.deduplicator.ray_video_deduplicator
- data_juicer.ops.deduplicator.video_deduplicator
- data_juicer.ops.filter.alphanumeric_filter
- data_juicer.ops.filter.audio_duration_filter
- data_juicer.ops.filter.audio_nmf_snr_filter
- data_juicer.ops.filter.audio_size_filter
- data_juicer.ops.filter.average_line_length_filter
- data_juicer.ops.filter.character_repetition_filter
- data_juicer.ops.filter.flagged_words_filter
- data_juicer.ops.filter.image_aesthetics_filter
- data_juicer.ops.filter.image_aspect_ratio_filter
- data_juicer.ops.filter.image_face_count_filter
- data_juicer.ops.filter.image_face_ratio_filter
- data_juicer.ops.filter.image_nsfw_filter
- data_juicer.ops.filter.image_pair_similarity_filter
- data_juicer.ops.filter.image_shape_filter
- data_juicer.ops.filter.image_size_filter
- data_juicer.ops.filter.image_text_matching_filter
- data_juicer.ops.filter.image_text_similarity_filter
- data_juicer.ops.filter.image_watermark_filter
- data_juicer.ops.filter.language_id_score_filter
- data_juicer.ops.filter.maximum_line_length_filter
- data_juicer.ops.filter.perplexity_filter
- data_juicer.ops.filter.phrase_grounding_recall_filter
- data_juicer.ops.filter.special_characters_filter
- data_juicer.ops.filter.specified_field_filter
- data_juicer.ops.filter.specified_numeric_field_filter
- data_juicer.ops.filter.stopwords_filter
- data_juicer.ops.filter.suffix_filter
- data_juicer.ops.filter.text_action_filter
- data_juicer.ops.filter.text_entity_dependency_filter
- data_juicer.ops.filter.text_length_filter
- data_juicer.ops.filter.token_num_filter
- data_juicer.ops.filter.video_aesthetics_filter
- data_juicer.ops.filter.video_aspect_ratio_filter
- data_juicer.ops.filter.video_duration_filter
- data_juicer.ops.filter.video_frames_text_similarity_filter
- data_juicer.ops.filter.video_motion_score_filter
- data_juicer.ops.filter.video_nsfw_filter
- data_juicer.ops.filter.video_ocr_area_ratio_filter
- data_juicer.ops.filter.video_resolution_filter
- data_juicer.ops.filter.video_tagging_from_frames_filter
- data_juicer.ops.filter.video_watermark_filter
- data_juicer.ops.filter.word_repetition_filter
- data_juicer.ops.filter.words_num_filter
- data_juicer.ops.load
- data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper
- data_juicer.ops.mapper.chinese_convert_mapper
- data_juicer.ops.mapper.clean_copyright_mapper
- data_juicer.ops.mapper.clean_email_mapper
- data_juicer.ops.mapper.clean_html_mapper
- data_juicer.ops.mapper.clean_ip_mapper
- data_juicer.ops.mapper.clean_links_mapper
- data_juicer.ops.mapper.expand_macro_mapper
- data_juicer.ops.mapper.extract_qa_mapper
- data_juicer.ops.mapper.fix_unicode_mapper
- data_juicer.ops.mapper.generate_instruction_mapper
- data_juicer.ops.mapper.image_blur_mapper
- data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
- data_juicer.ops.mapper.image_captioning_mapper
- data_juicer.ops.mapper.image_diffusion_mapper
- data_juicer.ops.mapper.image_face_blur_mapper
- data_juicer.ops.mapper.image_tagging_mapper
- data_juicer.ops.mapper.nlpaug_en_mapper
- data_juicer.ops.mapper.nlpcda_zh_mapper
- data_juicer.ops.mapper.optimize_instruction_mapper
- data_juicer.ops.mapper.punctuation_normalization_mapper
- data_juicer.ops.mapper.remove_bibliography_mapper
- data_juicer.ops.mapper.remove_comments_mapper
- data_juicer.ops.mapper.remove_header_mapper
- data_juicer.ops.mapper.remove_long_words_mapper
- data_juicer.ops.mapper.remove_non_chinese_character_mapper
- data_juicer.ops.mapper.remove_repeat_sentences_mapper
- data_juicer.ops.mapper.remove_specific_chars_mapper
- data_juicer.ops.mapper.remove_table_text_mapper
- data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper
- data_juicer.ops.mapper.replace_content_mapper
- data_juicer.ops.mapper.sentence_split_mapper
- data_juicer.ops.mapper.video_captioning_from_audio_mapper
- data_juicer.ops.mapper.video_captioning_from_frames_mapper
- data_juicer.ops.mapper.video_captioning_from_summarizer_mapper
- data_juicer.ops.mapper.video_captioning_from_video_mapper
- data_juicer.ops.mapper.video_face_blur_mapper
- data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper
- data_juicer.ops.mapper.video_remove_watermark_mapper
- data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
- data_juicer.ops.mapper.video_resize_resolution_mapper
- data_juicer.ops.mapper.video_split_by_duration_mapper
- data_juicer.ops.mapper.video_split_by_key_frame_mapper
- data_juicer.ops.mapper.video_split_by_scene_mapper
- data_juicer.ops.mapper.video_tagging_from_audio_mapper
- data_juicer.ops.mapper.video_tagging_from_frames_mapper
- data_juicer.ops.mapper.whitespace_normalization_mapper
- data_juicer.ops.selector.frequency_specified_field_selector
- data_juicer.ops.selector.random_selector
- data_juicer.ops.selector.range_specified_field_selector
- data_juicer.ops.selector.topk_specified_field_selector