data_juicer.utils.constant module

class data_juicer.utils.constant.Fields[source]

Bases: object

stats = '__dj__stats__'
meta = '__dj__meta__'
batch_meta = '__dj__batch_meta__'
context = '__dj__context__'
suffix = '__dj__suffix__'
source_file = '__dj__source_file__'
multimodal_data_output_dir = '__dj__produced_data__'
class data_juicer.utils.constant.BatchMetaKeys[source]

Bases: object

entity_attribute = 'entity_attribute'
most_relevant_entities = 'most_relevant_entities'
class data_juicer.utils.constant.MetaKeys[source]

Bases: object

dialog_sentiment_intensity = 'dialog_sentiment_intensity'
dialog_sentiment_intensity_analysis = 'dialog_sentiment_intensity_analysis'
query_sentiment_label = 'query_sentiment_label'
query_sentiment_score = 'query_sentiment_label_score'
dialog_sentiment_labels = 'dialog_sentiment_labels'
dialog_sentiment_labels_analysis = 'dialog_sentiment_labels_analysis'
dialog_intent_labels = 'dialog_intent_labels'
dialog_intent_labels_analysis = 'dialog_intent_labels_analysis'
query_intent_label = 'query_intent_label'
query_intent_score = 'query_intent_label_score'
dialog_topic_labels = 'dialog_topic_labels'
dialog_topic_labels_analysis = 'dialog_topic_labels_analysis'
query_topic_label = 'query_topic_label'
query_topic_score = 'query_topic_label_score'
video_frame_tags = 'video_frame_tags'
video_audio_tags = 'video_audio_tags'
video_frames = 'video_frames'
image_tags = 'image_tags'
bbox_tag = '__dj__bbox__'
class_label_tag = '__dj__class_label__'
event_description = 'event_description'
relevant_characters = 'relevant_characters'
main_entities = 'main_entities'
attributes = 'attributes'
attribute_descriptions = 'attribute_descriptions'
attribute_support_texts = 'attribute_support_texts'
nickname = 'nickname'
entity = 'entity'
entity_name = 'entity_name'
entity_type = 'entity_type'
entity_description = 'entity_entity_description'
relation = 'relation'
source_entity = 'relation_source_entity'
target_entity = 'relation_target_entity'
relation_description = 'relation_description'
relation_keywords = 'relation_keywords'
relation_strength = 'relation_strength'
keyword = 'keyword'
support_text = 'support_text'
role_relation = 'role_relation'
html_tables = 'html_tables'
class data_juicer.utils.constant.StatsKeysMeta[source]

Bases: type

A helper metaclass that tracks the mapping from each OP's name to the stats keys it uses.

e.g.,

    # once the AlphanumericFilter's compute_stats method has been called
    res = TrackingDescriptor.get_access_log()
    print(res)  # {"AlphanumericFilter": ["alnum_ratio", "alpha_token_ratio"]}

get_access_log(dj_cfg=None, dataset=None)[source]
class data_juicer.utils.constant.StatsKeysConstant[source]

Bases: object

alpha_token_ratio = 'alpha_token_ratio'
alnum_ratio = 'alnum_ratio'
avg_line_length = 'avg_line_length'
char_rep_ratio = 'char_rep_ratio'
flagged_words_ratio = 'flagged_words_ratio'
in_context_influence = 'in_context_influence'
ifd_score = 'ifd_score'
lang = 'lang'
lang_score = 'lang_score'
max_line_length = 'max_line_length'
perplexity = 'perplexity'
special_char_ratio = 'special_char_ratio'
stopwords_ratio = 'stopwords_ratio'
text_len = 'text_len'
text_embd_similarity = 'text_embd_similarity'
text_pair_similarity = 'text_pair_similarity'
num_action = 'num_action'
num_dependency_edges = 'num_dependency_edges'
num_token = 'num_token'
num_words = 'num_words'
word_rep_ratio = 'word_rep_ratio'
llm_analysis_score = 'llm_analysis_score'
llm_analysis_record = 'llm_analysis_record'
llm_quality_score = 'llm_quality_score'
llm_quality_record = 'llm_quality_record'
llm_difficulty_score = 'llm_difficulty_score'
llm_difficulty_record = 'llm_difficulty_record'
llm_perplexity = 'llm_perplexity'
llm_task_relevance = 'llm_task_relevance'
llm_task_relevance_record = 'llm_task_relevance_record'
aspect_ratios = 'aspect_ratios'
image_width = 'image_width'
image_height = 'image_height'
image_sizes = 'image_sizes'
face_ratios = 'face_ratios'
face_detections = 'face_detections'
face_counts = 'face_counts'
image_aesthetics_scores = 'image_aesthetics_scores'
image_nsfw_score = 'image_nsfw_score'
image_watermark_prob = 'image_watermark_prob'
image_pair_similarity = 'image_pair_similarity'
audio_duration = 'audio_duration'
audio_nmf_snr = 'audio_nmf_snr'
audio_sizes = 'audio_sizes'
video_duration = 'video_duration'
video_aspect_ratios = 'video_aspect_ratios'
video_width = 'video_width'
video_height = 'video_height'
video_ocr_area_ratio = 'video_ocr_area_ratio'
video_aesthetic_score = 'video_aesthetic_score'
video_frames_aesthetics_score = 'video_frames_aesthetics_score'
video_motion_score = 'video_motion_score'
video_nsfw_score = 'video_nsfw_score'
video_watermark_prob = 'video_watermark_prob'
image_text_similarity = 'image_text_similarity'
image_text_matching_score = 'image_text_matching_score'
phrase_grounding_recall = 'phrase_grounding_recall'
video_frames_text_similarity = 'video_frames_text_similarity'
general_field_filter_condition = 'general_field_filter_condition'
class data_juicer.utils.constant.StatsKeys[source]

Bases: object

class data_juicer.utils.constant.HashKeys[source]

Bases: object

uid = '__dj__uid'
hash = '__dj__hash'
minhash = '__dj__minhash'
simhash = '__dj__simhash'
imagehash = '__dj__imagehash'
videohash = '__dj__videohash'
is_unique = '__dj__is_unique'
class data_juicer.utils.constant.InterVars[source]

Bases: object

lines = '__dj__lines'
words = '__dj__words'
refined_words = '__dj__refined_words'
loaded_images = '__dj__loaded_images'
loaded_audios = '__dj__loaded_audios'
loaded_videos = '__dj__loaded_videos'
sampled_frames = '__dj__sampled_frames'
class data_juicer.utils.constant.JobRequiredKeys(value)[source]

Bases: Enum

hook = 'hook'
meta_name = 'meta_name'
input = 'input'
output = 'output'
local = 'local'
dj_configs = 'dj_configs'
extra_configs = 'extra_configs'