data_juicer package
Subpackages
- data_juicer.analysis package
- Submodules
- data_juicer.analysis.collector module
- data_juicer.analysis.column_wise_analysis module
- data_juicer.analysis.diversity_analysis module
- data_juicer.analysis.draw module
- data_juicer.analysis.measure module
- data_juicer.analysis.overall_analysis module
- Module contents
- data_juicer.config package
- data_juicer.core package
- Submodules
- data_juicer.core.adapter module
- data_juicer.core.analyzer module
- data_juicer.core.data module
DJDataset
wrap_func_with_nested_access()
nested_obj_factory()
NestedQueryDict
NestedDatasetDict
NestedDataset
NestedDataset.__init__()
NestedDataset.process()
NestedDataset.update_args()
NestedDataset.map()
NestedDataset.filter()
NestedDataset.select()
NestedDataset.from_dict()
NestedDataset.add_column()
NestedDataset.select_columns()
NestedDataset.remove_columns()
NestedDataset.cleanup_cache_files()
NestedDataset.load_from_disk()
nested_query()
add_same_content_to_new_column()
- data_juicer.core.executor module
- data_juicer.core.exporter module
- data_juicer.core.monitor module
- data_juicer.core.ray_data module
- data_juicer.core.ray_executor module
- data_juicer.core.tracer module
- Module contents
Adapter
Analyzer
NestedDataset
NestedDataset.__init__()
NestedDataset.process()
NestedDataset.update_args()
NestedDataset.map()
NestedDataset.filter()
NestedDataset.select()
NestedDataset.from_dict()
NestedDataset.add_column()
NestedDataset.select_columns()
NestedDataset.remove_columns()
NestedDataset.cleanup_cache_files()
NestedDataset.load_from_disk()
Executor
Exporter
Monitor
Tracer
- data_juicer.format package
- Submodules
- data_juicer.format.csv_formatter module
- data_juicer.format.empty_formatter module
- data_juicer.format.formatter module
- data_juicer.format.json_formatter module
- data_juicer.format.load module
- data_juicer.format.mixture_formatter module
- data_juicer.format.parquet_formatter module
- data_juicer.format.text_formatter module
- data_juicer.format.tsv_formatter module
- Module contents
- data_juicer.ops package
- Subpackages
- data_juicer.ops.aggregator package
- data_juicer.ops.common package
- data_juicer.ops.deduplicator package
- Submodules
- data_juicer.ops.deduplicator.document_deduplicator module
- data_juicer.ops.deduplicator.document_minhash_deduplicator module
- data_juicer.ops.deduplicator.document_simhash_deduplicator module
- data_juicer.ops.deduplicator.image_deduplicator module
- data_juicer.ops.deduplicator.ray_basic_deduplicator module
- data_juicer.ops.deduplicator.ray_document_deduplicator module
- data_juicer.ops.deduplicator.ray_image_deduplicator module
- data_juicer.ops.deduplicator.ray_video_deduplicator module
- data_juicer.ops.deduplicator.video_deduplicator module
- Module contents
- data_juicer.ops.filter package
- Submodules
- data_juicer.ops.filter.alphanumeric_filter module
- data_juicer.ops.filter.audio_duration_filter module
- data_juicer.ops.filter.audio_nmf_snr_filter module
- data_juicer.ops.filter.audio_size_filter module
- data_juicer.ops.filter.average_line_length_filter module
- data_juicer.ops.filter.character_repetition_filter module
- data_juicer.ops.filter.flagged_words_filter module
- data_juicer.ops.filter.image_aesthetics_filter module
- data_juicer.ops.filter.image_aspect_ratio_filter module
- data_juicer.ops.filter.image_face_count_filter module
- data_juicer.ops.filter.image_face_ratio_filter module
- data_juicer.ops.filter.image_nsfw_filter module
- data_juicer.ops.filter.image_pair_similarity_filter module
- data_juicer.ops.filter.image_shape_filter module
- data_juicer.ops.filter.image_size_filter module
- data_juicer.ops.filter.image_text_matching_filter module
- data_juicer.ops.filter.image_text_similarity_filter module
- data_juicer.ops.filter.image_watermark_filter module
- data_juicer.ops.filter.language_id_score_filter module
- data_juicer.ops.filter.maximum_line_length_filter module
- data_juicer.ops.filter.perplexity_filter module
- data_juicer.ops.filter.phrase_grounding_recall_filter module
- data_juicer.ops.filter.special_characters_filter module
- data_juicer.ops.filter.specified_field_filter module
- data_juicer.ops.filter.specified_numeric_field_filter module
- data_juicer.ops.filter.stopwords_filter module
- data_juicer.ops.filter.suffix_filter module
- data_juicer.ops.filter.text_action_filter module
- data_juicer.ops.filter.text_entity_dependency_filter module
- data_juicer.ops.filter.text_length_filter module
- data_juicer.ops.filter.token_num_filter module
- data_juicer.ops.filter.video_aesthetics_filter module
- data_juicer.ops.filter.video_aspect_ratio_filter module
- data_juicer.ops.filter.video_duration_filter module
- data_juicer.ops.filter.video_frames_text_similarity_filter module
- data_juicer.ops.filter.video_motion_score_filter module
- data_juicer.ops.filter.video_motion_score_raft_filter module
- data_juicer.ops.filter.video_nsfw_filter module
- data_juicer.ops.filter.video_ocr_area_ratio_filter module
- data_juicer.ops.filter.video_resolution_filter module
- data_juicer.ops.filter.video_tagging_from_frames_filter module
- data_juicer.ops.filter.video_watermark_filter module
- data_juicer.ops.filter.word_repetition_filter module
- data_juicer.ops.filter.words_num_filter module
- Module contents
- data_juicer.ops.grouper package
- data_juicer.ops.mapper package
- Submodules
- data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper module
- data_juicer.ops.mapper.calibrate_qa_mapper module
- data_juicer.ops.mapper.calibrate_query_mapper module
- data_juicer.ops.mapper.calibrate_response_mapper module
- data_juicer.ops.mapper.chinese_convert_mapper module
- data_juicer.ops.mapper.clean_copyright_mapper module
- data_juicer.ops.mapper.clean_email_mapper module
- data_juicer.ops.mapper.clean_html_mapper module
- data_juicer.ops.mapper.clean_ip_mapper module
- data_juicer.ops.mapper.clean_links_mapper module
- data_juicer.ops.mapper.expand_macro_mapper module
- data_juicer.ops.mapper.extract_entity_attribute_mapper module
- data_juicer.ops.mapper.extract_entity_relation_mapper module
- data_juicer.ops.mapper.extract_event_mapper module
- data_juicer.ops.mapper.extract_keyword_mapper module
- data_juicer.ops.mapper.extract_nickname_mapper module
- data_juicer.ops.mapper.extract_support_text_mapper module
- data_juicer.ops.mapper.fix_unicode_mapper module
- data_juicer.ops.mapper.generate_qa_from_examples_mapper module
- data_juicer.ops.mapper.generate_qa_from_text_mapper module
- data_juicer.ops.mapper.image_blur_mapper module
- data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper module
- data_juicer.ops.mapper.image_captioning_mapper module
- data_juicer.ops.mapper.image_diffusion_mapper module
- data_juicer.ops.mapper.image_face_blur_mapper module
- data_juicer.ops.mapper.image_tagging_mapper module
- data_juicer.ops.mapper.nlpaug_en_mapper module
- data_juicer.ops.mapper.nlpcda_zh_mapper module
- data_juicer.ops.mapper.optimize_qa_mapper module
- data_juicer.ops.mapper.optimize_query_mapper module
- data_juicer.ops.mapper.optimize_response_mapper module
- data_juicer.ops.mapper.pair_preference_mapper module
- data_juicer.ops.mapper.punctuation_normalization_mapper module
- data_juicer.ops.mapper.python_file_mapper module
- data_juicer.ops.mapper.python_lambda_mapper module
- data_juicer.ops.mapper.relation_identity_mapper module
- data_juicer.ops.mapper.remove_bibliography_mapper module
- data_juicer.ops.mapper.remove_comments_mapper module
- data_juicer.ops.mapper.remove_header_mapper module
- data_juicer.ops.mapper.remove_long_words_mapper module
- data_juicer.ops.mapper.remove_non_chinese_character_mapper module
- data_juicer.ops.mapper.remove_repeat_sentences_mapper module
- data_juicer.ops.mapper.remove_specific_chars_mapper module
- data_juicer.ops.mapper.remove_table_text_mapper module
- data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper module
- data_juicer.ops.mapper.replace_content_mapper module
- data_juicer.ops.mapper.sentence_split_mapper module
- data_juicer.ops.mapper.text_chunk_mapper module
- data_juicer.ops.mapper.video_captioning_from_audio_mapper module
- data_juicer.ops.mapper.video_captioning_from_frames_mapper module
- data_juicer.ops.mapper.video_captioning_from_summarizer_mapper module
- data_juicer.ops.mapper.video_captioning_from_video_mapper module
- data_juicer.ops.mapper.video_extract_frames_mapper module
- data_juicer.ops.mapper.video_face_blur_mapper module
- data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper module
- data_juicer.ops.mapper.video_remove_watermark_mapper module
- data_juicer.ops.mapper.video_resize_aspect_ratio_mapper module
- data_juicer.ops.mapper.video_resize_resolution_mapper module
- data_juicer.ops.mapper.video_split_by_duration_mapper module
- data_juicer.ops.mapper.video_split_by_key_frame_mapper module
- data_juicer.ops.mapper.video_split_by_scene_mapper module
- data_juicer.ops.mapper.video_tagging_from_audio_mapper module
- data_juicer.ops.mapper.video_tagging_from_frames_mapper module
- data_juicer.ops.mapper.whitespace_normalization_mapper module
- Module contents
- data_juicer.ops.selector package
- Submodules
- data_juicer.ops.base_op module
- data_juicer.ops.load module
- data_juicer.ops.op_fusion module
- Module contents
- Subpackages
- data_juicer.tools package
- data_juicer.utils package
- Submodules
- data_juicer.utils.asset_utils module
- data_juicer.utils.auto_install_mapping module
- data_juicer.utils.auto_install_utils module
- data_juicer.utils.availability_utils module
- data_juicer.utils.cache_utils module
- data_juicer.utils.ckpt_utils module
- data_juicer.utils.common_utils module
- data_juicer.utils.compress module
- data_juicer.utils.constant module
Fields
Fields.stats
Fields.meta
Fields.context
Fields.suffix
Fields.video_frame_tags
Fields.video_audio_tags
Fields.image_tags
Fields.video_frames
Fields.source_file
Fields.multimodal_data_output_dir
Fields.event_description
Fields.relevant_characters
Fields.main_entities
Fields.attributes
Fields.attribute_descriptions
Fields.attribute_support_texts
Fields.nickname
Fields.entity
Fields.entity_name
Fields.entity_type
Fields.entity_description
Fields.relation
Fields.source_entity
Fields.target_entity
Fields.relation_description
Fields.relation_keywords
Fields.relation_strength
Fields.keyword
Fields.support_text
StatsKeysMeta
StatsKeysConstant
StatsKeysConstant.alpha_token_ratio
StatsKeysConstant.alnum_ratio
StatsKeysConstant.avg_line_length
StatsKeysConstant.char_rep_ratio
StatsKeysConstant.flagged_words_ratio
StatsKeysConstant.lang
StatsKeysConstant.lang_score
StatsKeysConstant.max_line_length
StatsKeysConstant.perplexity
StatsKeysConstant.special_char_ratio
StatsKeysConstant.stopwords_ratio
StatsKeysConstant.text_len
StatsKeysConstant.num_action
StatsKeysConstant.num_dependency_edges
StatsKeysConstant.num_token
StatsKeysConstant.num_words
StatsKeysConstant.word_rep_ratio
StatsKeysConstant.aspect_ratios
StatsKeysConstant.image_width
StatsKeysConstant.image_height
StatsKeysConstant.image_sizes
StatsKeysConstant.face_ratios
StatsKeysConstant.face_detections
StatsKeysConstant.face_counts
StatsKeysConstant.image_aesthetics_scores
StatsKeysConstant.image_nsfw_score
StatsKeysConstant.image_watermark_prob
StatsKeysConstant.image_pair_similarity
StatsKeysConstant.audio_duration
StatsKeysConstant.audio_nmf_snr
StatsKeysConstant.audio_sizes
StatsKeysConstant.video_duration
StatsKeysConstant.video_aspect_ratios
StatsKeysConstant.video_width
StatsKeysConstant.video_height
StatsKeysConstant.video_ocr_area_ratio
StatsKeysConstant.video_aesthetic_score
StatsKeysConstant.video_frames_aesthetics_score
StatsKeysConstant.video_motion_score
StatsKeysConstant.video_nsfw_score
StatsKeysConstant.video_watermark_prob
StatsKeysConstant.image_text_similarity
StatsKeysConstant.image_text_matching_score
StatsKeysConstant.phrase_grounding_recall
StatsKeysConstant.video_frames_text_similarity
StatsKeys
HashKeys
InterVars
JobRequiredKeys
- data_juicer.utils.file_utils module
- data_juicer.utils.fingerprint_utils module
- data_juicer.utils.lazy_loader module
- data_juicer.utils.logger_utils module
- data_juicer.utils.mm_utils module
SpecialTokens
AV_STREAM_THREAD_TYPE
get_special_tokens()
remove_special_tokens()
remove_non_special_tokens()
load_data_with_context()
load_images()
load_images_byte()
load_image()
load_image_byte()
image_path_to_base64()
image_byte_to_base64()
pil_to_opencv()
detect_faces()
get_file_size()
iou()
calculate_resized_dimensions()
load_audios()
load_audio()
load_videos()
load_video()
get_video_duration()
get_decoded_frames_from_video()
cut_video_by_seconds()
process_each_frame()
extract_key_frames_by_seconds()
extract_key_frames()
get_key_frame_seconds()
extract_video_frames_uniformly_by_seconds()
extract_video_frames_uniformly()
extract_audio_from_video()
size_to_bytes()
insert_texts_after_placeholders()
timecode_string_to_seconds()
parse_string_to_roi()
close_video()
- data_juicer.utils.model_utils module
get_backup_model_link()
check_model()
APIModel
prepare_api_model()
prepare_diffusion_model()
prepare_fasttext_model()
prepare_huggingface_model()
prepare_kenlm_model()
prepare_nltk_model()
prepare_opencv_classifier()
prepare_recognizeAnything_model()
prepare_sentencepiece_model()
prepare_sentencepiece_for_lang()
prepare_simple_aesthetics_model()
prepare_spacy_model()
prepare_video_blip_model()
prepare_vllm_model()
prepare_model()
get_model()
free_models()
- data_juicer.utils.process_utils module
- data_juicer.utils.registry module
- data_juicer.utils.resource_utils module
- data_juicer.utils.unittest_utils module
- Module contents