Welcome to data-juicer’s documentation!
Tutorial
We will give a tutorial at KDD’24, “Multi-modal Data Processing for Foundation Models: Practical Guidances and Use Cases”; see more details here!
- data_juicer.core package
- Submodules
- data_juicer.core.adapter module
- data_juicer.core.analyzer module
- data_juicer.core.data module
- data_juicer.core.executor module
- data_juicer.core.exporter module
- data_juicer.core.monitor module
- data_juicer.core.ray_data module
- data_juicer.core.ray_executor module
- data_juicer.core.tracer module
- Module contents
- data_juicer.ops package
- data_juicer.ops.filter package
- Submodules
- data_juicer.ops.filter.alphanumeric_filter module
- data_juicer.ops.filter.audio_duration_filter module
- data_juicer.ops.filter.audio_nmf_snr_filter module
- data_juicer.ops.filter.audio_size_filter module
- data_juicer.ops.filter.average_line_length_filter module
- data_juicer.ops.filter.character_repetition_filter module
- data_juicer.ops.filter.flagged_words_filter module
- data_juicer.ops.filter.image_aesthetics_filter module
- data_juicer.ops.filter.image_aspect_ratio_filter module
- data_juicer.ops.filter.image_face_count_filter module
- data_juicer.ops.filter.image_face_ratio_filter module
- data_juicer.ops.filter.image_nsfw_filter module
- data_juicer.ops.filter.image_pair_similarity_filter module
- data_juicer.ops.filter.image_shape_filter module
- data_juicer.ops.filter.image_size_filter module
- data_juicer.ops.filter.image_text_matching_filter module
- data_juicer.ops.filter.image_text_similarity_filter module
- data_juicer.ops.filter.image_watermark_filter module
- data_juicer.ops.filter.language_id_score_filter module
- data_juicer.ops.filter.maximum_line_length_filter module
- data_juicer.ops.filter.perplexity_filter module
- data_juicer.ops.filter.phrase_grounding_recall_filter module
- data_juicer.ops.filter.special_characters_filter module
- data_juicer.ops.filter.specified_field_filter module
- data_juicer.ops.filter.specified_numeric_field_filter module
- data_juicer.ops.filter.stopwords_filter module
- data_juicer.ops.filter.suffix_filter module
- data_juicer.ops.filter.text_action_filter module
- data_juicer.ops.filter.text_entity_dependency_filter module
- data_juicer.ops.filter.text_length_filter module
- data_juicer.ops.filter.token_num_filter module
- data_juicer.ops.filter.video_aesthetics_filter module
- data_juicer.ops.filter.video_aspect_ratio_filter module
- data_juicer.ops.filter.video_duration_filter module
- data_juicer.ops.filter.video_frames_text_similarity_filter module
- data_juicer.ops.filter.video_motion_score_filter module
- data_juicer.ops.filter.video_motion_score_raft_filter module
- data_juicer.ops.filter.video_nsfw_filter module
- data_juicer.ops.filter.video_ocr_area_ratio_filter module
- data_juicer.ops.filter.video_resolution_filter module
- data_juicer.ops.filter.video_tagging_from_frames_filter module
- data_juicer.ops.filter.video_watermark_filter module
- data_juicer.ops.filter.word_repetition_filter module
- data_juicer.ops.filter.words_num_filter module
- Module contents
- data_juicer.ops.mapper package
- Submodules
- data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper module
- data_juicer.ops.mapper.calibrate_qa_mapper module
- data_juicer.ops.mapper.calibrate_query_mapper module
- data_juicer.ops.mapper.calibrate_response_mapper module
- data_juicer.ops.mapper.chinese_convert_mapper module
- data_juicer.ops.mapper.clean_copyright_mapper module
- data_juicer.ops.mapper.clean_email_mapper module
- data_juicer.ops.mapper.clean_html_mapper module
- data_juicer.ops.mapper.clean_ip_mapper module
- data_juicer.ops.mapper.clean_links_mapper module
- data_juicer.ops.mapper.expand_macro_mapper module
- data_juicer.ops.mapper.extract_entity_attribute_mapper module
- data_juicer.ops.mapper.extract_entity_relation_mapper module
- data_juicer.ops.mapper.extract_event_mapper module
- data_juicer.ops.mapper.extract_keyword_mapper module
- data_juicer.ops.mapper.extract_nickname_mapper module
- data_juicer.ops.mapper.extract_support_text_mapper module
- data_juicer.ops.mapper.fix_unicode_mapper module
- data_juicer.ops.mapper.generate_qa_from_examples_mapper module
- data_juicer.ops.mapper.generate_qa_from_text_mapper module
- data_juicer.ops.mapper.image_blur_mapper module
- data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper module
- data_juicer.ops.mapper.image_captioning_mapper module
- data_juicer.ops.mapper.image_diffusion_mapper module
- data_juicer.ops.mapper.image_face_blur_mapper module
- data_juicer.ops.mapper.image_tagging_mapper module
- data_juicer.ops.mapper.nlpaug_en_mapper module
- data_juicer.ops.mapper.nlpcda_zh_mapper module
- data_juicer.ops.mapper.optimize_qa_mapper module
- data_juicer.ops.mapper.optimize_query_mapper module
- data_juicer.ops.mapper.optimize_response_mapper module
- data_juicer.ops.mapper.pair_preference_mapper module
- data_juicer.ops.mapper.punctuation_normalization_mapper module
- data_juicer.ops.mapper.python_file_mapper module
- data_juicer.ops.mapper.python_lambda_mapper module
- data_juicer.ops.mapper.relation_identity_mapper module
- data_juicer.ops.mapper.remove_bibliography_mapper module
- data_juicer.ops.mapper.remove_comments_mapper module
- data_juicer.ops.mapper.remove_header_mapper module
- data_juicer.ops.mapper.remove_long_words_mapper module
- data_juicer.ops.mapper.remove_non_chinese_character_mapper module
- data_juicer.ops.mapper.remove_repeat_sentences_mapper module
- data_juicer.ops.mapper.remove_specific_chars_mapper module
- data_juicer.ops.mapper.remove_table_text_mapper module
- data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper module
- data_juicer.ops.mapper.replace_content_mapper module
- data_juicer.ops.mapper.sentence_split_mapper module
- data_juicer.ops.mapper.text_chunk_mapper module
- data_juicer.ops.mapper.video_captioning_from_audio_mapper module
- data_juicer.ops.mapper.video_captioning_from_frames_mapper module
- data_juicer.ops.mapper.video_captioning_from_summarizer_mapper module
- data_juicer.ops.mapper.video_captioning_from_video_mapper module
- data_juicer.ops.mapper.video_extract_frames_mapper module
- data_juicer.ops.mapper.video_face_blur_mapper module
- data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper module
- data_juicer.ops.mapper.video_remove_watermark_mapper module
- data_juicer.ops.mapper.video_resize_aspect_ratio_mapper module
- data_juicer.ops.mapper.video_resize_resolution_mapper module
- data_juicer.ops.mapper.video_split_by_duration_mapper module
- data_juicer.ops.mapper.video_split_by_key_frame_mapper module
- data_juicer.ops.mapper.video_split_by_scene_mapper module
- data_juicer.ops.mapper.video_tagging_from_audio_mapper module
- data_juicer.ops.mapper.video_tagging_from_frames_mapper module
- data_juicer.ops.mapper.whitespace_normalization_mapper module
- Module contents
- data_juicer.ops.deduplicator package
- Submodules
- data_juicer.ops.deduplicator.document_deduplicator module
- data_juicer.ops.deduplicator.document_minhash_deduplicator module
- data_juicer.ops.deduplicator.document_simhash_deduplicator module
- data_juicer.ops.deduplicator.image_deduplicator module
- data_juicer.ops.deduplicator.ray_basic_deduplicator module
- data_juicer.ops.deduplicator.ray_document_deduplicator module
- data_juicer.ops.deduplicator.ray_image_deduplicator module
- data_juicer.ops.deduplicator.ray_video_deduplicator module
- data_juicer.ops.deduplicator.video_deduplicator module
- Module contents
- data_juicer.ops.selector package
- data_juicer.ops.common package
- data_juicer.analysis package
- data_juicer.config package
- data_juicer.format package
- Submodules
- data_juicer.format.csv_formatter module
- data_juicer.format.empty_formatter module
- data_juicer.format.formatter module
- data_juicer.format.json_formatter module
- data_juicer.format.load module
- data_juicer.format.mixture_formatter module
- data_juicer.format.parquet_formatter module
- data_juicer.format.text_formatter module
- data_juicer.format.tsv_formatter module
- Module contents