data_juicer.utils.constant module¶
- class data_juicer.utils.constant.Fields[source]¶
Bases:
object
- stats = '__dj__stats__'¶
- meta = '__dj__meta__'¶
- batch_meta = '__dj__batch_meta__'¶
- context = '__dj__context__'¶
- suffix = '__dj__suffix__'¶
- source_file = '__dj__source_file__'¶
- multimodal_data_output_dir = '__dj__produced_data__'¶
- class data_juicer.utils.constant.BatchMetaKeys[source]¶
Bases:
object
- entity_attribute = 'entity_attribute'¶
- most_relevant_entities = 'most_relevant_entities'¶
- class data_juicer.utils.constant.MetaKeys[source]¶
Bases:
object
- dialog_sentiment_intensity = 'dialog_sentiment_intensity'¶
- dialog_sentiment_intensity_analysis = 'dialog_sentiment_intensity_analysis'¶
- query_sentiment_label = 'query_sentiment_label'¶
- query_sentiment_score = 'query_sentiment_label_score'¶
- dialog_sentiment_labels = 'dialog_sentiment_labels'¶
- dialog_sentiment_labels_analysis = 'dialog_sentiment_labels_analysis'¶
- dialog_intent_labels = 'dialog_intent_labels'¶
- dialog_intent_labels_analysis = 'dialog_intent_labels_analysis'¶
- query_intent_label = 'query_intent_label'¶
- query_intent_score = 'query_intent_label_score'¶
- dialog_topic_labels = 'dialog_topic_labels'¶
- dialog_topic_labels_analysis = 'dialog_topic_labels_analysis'¶
- query_topic_label = 'query_topic_label'¶
- query_topic_score = 'query_topic_label_score'¶
- video_frame_tags = 'video_frame_tags'¶
- video_audio_tags = 'video_audio_tags'¶
- video_frames = 'video_frames'¶
- image_tags = 'image_tags'¶
- bbox_tag = '__dj__bbox__'¶
- class_label_tag = '__dj__class_label__'¶
- event_description = 'event_description'¶
- relevant_characters = 'relevant_characters'¶
- main_entities = 'main_entities'¶
- attributes = 'attributes'¶
- attribute_descriptions = 'attribute_descriptions'¶
- attribute_support_texts = 'attribute_support_texts'¶
- nickname = 'nickname'¶
- entity = 'entity'¶
- entity_name = 'entity_name'¶
- entity_type = 'entity_type'¶
- entity_description = 'entity_entity_description'¶
- relation = 'relation'¶
- source_entity = 'relation_source_entity'¶
- target_entity = 'relation_target_entity'¶
- relation_description = 'relation_description'¶
- relation_keywords = 'relation_keywords'¶
- relation_strength = 'relation_strength'¶
- keyword = 'keyword'¶
- support_text = 'support_text'¶
- role_relation = 'role_relation'¶
- html_tables = 'html_tables'¶
- class data_juicer.utils.constant.StatsKeysMeta[source]¶
Bases:
type
A helper class (a metaclass, since it derives from type) that tracks the mapping from each OP's name to the stats_keys it uses.
e.g.:
    # once the AlphanumericFilter's compute_stats method has been called
    res = TrackingDescriptor.get_access_log()
    print(res)  # {"AlphanumericFilter": ["alnum_ratio", "alpha_token_ratio"]}
- class data_juicer.utils.constant.StatsKeysConstant[source]¶
Bases:
object
- alpha_token_ratio = 'alpha_token_ratio'¶
- alnum_ratio = 'alnum_ratio'¶
- avg_line_length = 'avg_line_length'¶
- char_rep_ratio = 'char_rep_ratio'¶
- flagged_words_ratio = 'flagged_words_ratio'¶
- in_context_influence = 'in_context_influence'¶
- ifd_score = 'ifd_score'¶
- lang = 'lang'¶
- lang_score = 'lang_score'¶
- max_line_length = 'max_line_length'¶
- perplexity = 'perplexity'¶
- special_char_ratio = 'special_char_ratio'¶
- stopwords_ratio = 'stopwords_ratio'¶
- text_len = 'text_len'¶
- text_embd_similarity = 'text_embd_similarity'¶
- text_pair_similarity = 'text_pair_similarity'¶
- num_action = 'num_action'¶
- num_dependency_edges = 'num_dependency_edges'¶
- num_token = 'num_token'¶
- num_words = 'num_words'¶
- word_rep_ratio = 'word_rep_ratio'¶
- llm_analysis_score = 'llm_analysis_score'¶
- llm_analysis_record = 'llm_analysis_record'¶
- llm_quality_score = 'llm_quality_score'¶
- llm_quality_record = 'llm_quality_record'¶
- llm_difficulty_score = 'llm_difficulty_score'¶
- llm_difficulty_record = 'llm_difficulty_record'¶
- llm_perplexity = 'llm_perplexity'¶
- llm_task_relevance = 'llm_task_relevance'¶
- llm_task_relevance_record = 'llm_task_relevance_record'¶
- aspect_ratios = 'aspect_ratios'¶
- image_width = 'image_width'¶
- image_height = 'image_height'¶
- image_sizes = 'image_sizes'¶
- face_ratios = 'face_ratios'¶
- face_detections = 'face_detections'¶
- face_counts = 'face_counts'¶
- image_aesthetics_scores = 'image_aesthetics_scores'¶
- image_nsfw_score = 'image_nsfw_score'¶
- image_watermark_prob = 'image_watermark_prob'¶
- image_pair_similarity = 'image_pair_similarity'¶
- audio_duration = 'audio_duration'¶
- audio_nmf_snr = 'audio_nmf_snr'¶
- audio_sizes = 'audio_sizes'¶
- video_duration = 'video_duration'¶
- video_aspect_ratios = 'video_aspect_ratios'¶
- video_width = 'video_width'¶
- video_height = 'video_height'¶
- video_ocr_area_ratio = 'video_ocr_area_ratio'¶
- video_aesthetic_score = 'video_aesthetic_score'¶
- video_frames_aesthetics_score = 'video_frames_aesthetics_score'¶
- video_motion_score = 'video_motion_score'¶
- video_nsfw_score = 'video_nsfw_score'¶
- video_watermark_prob = 'video_watermark_prob'¶
- image_text_similarity = 'image_text_similarity'¶
- image_text_matching_score = 'image_text_matching_score'¶
- phrase_grounding_recall = 'phrase_grounding_recall'¶
- video_frames_text_similarity = 'video_frames_text_similarity'¶
- general_field_filter_condition = 'general_field_filter_condition'¶
- class data_juicer.utils.constant.HashKeys[source]¶
Bases:
object
- uid = '__dj__uid'¶
- hash = '__dj__hash'¶
- minhash = '__dj__minhash'¶
- simhash = '__dj__simhash'¶
- imagehash = '__dj__imagehash'¶
- videohash = '__dj__videohash'¶
- is_unique = '__dj__is_unique'¶
- class data_juicer.utils.constant.InterVars[source]¶
Bases:
object
- lines = '__dj__lines'¶
- words = '__dj__words'¶
- refined_words = '__dj__refined_words'¶
- loaded_images = '__dj__loaded_images'¶
- loaded_audios = '__dj__loaded_audios'¶
- loaded_videos = '__dj__loaded_videos'¶
- sampled_frames = '__dj__sampled_frames'¶