Source code for data_juicer.utils.constant
import copy
import inspect
import io
import os
from enum import Enum

import zstandard as zstd
from loguru import logger

DEFAULT_PREFIX = '__dj__'

class Fields(object):
    # for storing stats generated by filter OPs
    stats = DEFAULT_PREFIX + 'stats__'
    # for storing metas generated by mapper OPs
    meta = DEFAULT_PREFIX + 'meta__'
    # for storing metas of batched samples generated by aggregator OPs
    batch_meta = DEFAULT_PREFIX + 'batch_meta__'
    context = DEFAULT_PREFIX + 'context__'
    suffix = DEFAULT_PREFIX + 'suffix__'
    # the name of the original file from which this sample was derived
    source_file = DEFAULT_PREFIX + 'source_file__'
    # the name of the directory to store the produced multimodal data
    multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'
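
A usage sketch (the sample layout below is illustrative only, assuming samples are plain dicts): the `__dj__`-prefixed column names keep OP-generated bookkeeping separate from payload fields such as 'text'.

# illustrative only -- not part of this module
sample = {
    'text': 'Hello, Data-Juicer!',
    Fields.stats: {},  # filter OPs write per-sample stats here
    Fields.meta: {},   # mapper OPs write per-sample metas here
}
sample[Fields.stats]['text_len'] = len(sample['text'])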

class BatchMetaKeys(object):
    entity_attribute = 'entity_attribute'
    most_relavant_entities = 'most_relavant_entities'

class MetaKeys(object):
    # === text related tags ===
    # # sentiment
    dialog_sentiment_intensity = 'dialog_sentiment_intensity'
    dialog_sentiment_intensity_analysis = 'dialog_sentiment_intensity_analysis'
    query_sentiment_label = 'query_sentiment_label'
    query_sentiment_score = 'query_sentiment_label_score'
    dialog_sentiment_labels = 'dialog_sentiment_labels'
    dialog_sentiment_labels_analysis = 'dialog_sentiment_labels_analysis'
    # # intent
    dialog_intent_labels = 'dialog_intent_labels'
    dialog_intent_labels_analysis = 'dialog_intent_labels_analysis'
    query_intent_label = 'query_intent_label'
    query_intent_score = 'query_intent_label_score'
    # # topic
    dialog_topic_labels = 'dialog_topic_labels'
    dialog_topic_labels_analysis = 'dialog_topic_labels_analysis'
    query_topic_label = 'query_topic_label'
    query_topic_score = 'query_topic_label_score'

    # === multi-modal related tags ===
    # # video-frame tags
    video_frame_tags = 'video_frame_tags'
    # # video-audio tags
    video_audio_tags = 'video_audio_tags'
    # # video frames
    video_frames = 'video_frames'
    # # image tags
    image_tags = 'image_tags'
    # # bounding box tag
    bbox_tag = DEFAULT_PREFIX + 'bbox__'

    # === info extraction related tags ===
    # # for event extraction
    event_description = 'event_description'
    # # a list of characters relevant to the event
    relevant_characters = 'relevant_characters'
    # # the given main entities for attribute extraction
    main_entities = 'main_entities'
    # # the given attributes to be extracted
    attributes = 'attributes'
    # # the extracted attribute descriptions
    attribute_descriptions = 'attribute_descriptions'
    # # texts extracted from the raw data to support the attributes
    attribute_support_texts = 'attribute_support_texts'
    # # the nickname relationship
    nickname = 'nickname'
    # # the entity for the knowledge graph
    entity = 'entity'
    # # # the name of the entity
    entity_name = 'entity_name'
    # # # the type of the entity
    entity_type = 'entity_type'
    # # # the description of the entity
    entity_description = 'entity_entity_description'
    # # the relationship for the knowledge graph
    relation = 'relation'
    # # # the source entity of the relation
    source_entity = 'relation_source_entity'
    # # # the target entity of the relation
    target_entity = 'relation_target_entity'
    # # # the description of the relation
    relation_description = 'relation_description'
    # # # the keywords of the relation
    relation_keywords = 'relation_keywords'
    # # # the strength of the relation
    relation_strength = 'relation_strength'
    # # the keyword in a text
    keyword = 'keyword'
    # # support text
    support_text = 'support_text'
    # # role relation
    role_relation = 'role_relation'

class StatsKeysMeta(type):
    """
    A helper metaclass that tracks the mapping from each OP's name to the
    stats keys it accesses, e.g., once AlphanumericFilter's compute_stats
    method has been called:

    res = StatsKeys.get_access_log()
    print(res)  # {"AlphanumericFilter": {"alnum_ratio", "alpha_token_ratio"}}
    """
    _accessed_by = {}

    def __getattr__(cls, attr):
        # the caller's module name serves as the OP's name, since each OP
        # is defined in a module of the same name
        caller_class = inspect.currentframe().f_back.f_globals['__name__']
        # no need to track the parent packages; keep only the last component
        # of the module path
        caller_class = caller_class.split('.')[-1]
        stat_key = getattr(cls._constants_class, attr)
        if caller_class not in cls._accessed_by:
            cls._accessed_by[caller_class] = set()
        if stat_key not in cls._accessed_by[caller_class]:
            cls._accessed_by[caller_class].add(stat_key)
        return stat_key

    def get_access_log(cls, dj_cfg=None):
        if cls._accessed_by:
            return cls._accessed_by
        elif dj_cfg:
            tmp_dj_cfg = copy.deepcopy(dj_cfg)
            # the accesses have been skipped due to the use of cache, so we
            # rerun the OPs on a temporary one-sample dataset to rebuild the
            # access log
            if os.path.exists(dj_cfg.dataset_path) and \
                    ('jsonl' in dj_cfg.dataset_path or
                     'jsonl.zst' in dj_cfg.dataset_path):
                logger.info(
                    'Begin to track the usage of ops with a dummy data sample')
                # load the first line as tmp_data
                tmp_f_name = None
                first_line = None
                if 'jsonl.zst' in dj_cfg.dataset_path:
                    tmp_f_name = dj_cfg.dataset_path. \
                        replace('.jsonl.zst', '.tmp.jsonl')
                    # open the file in binary mode and create a Zstandard
                    # decompression context
                    with open(dj_cfg.dataset_path, 'rb') as compressed_file:
                        dctx = zstd.ZstdDecompressor()
                        # create a stream reader for the file and decode the
                        # first line
                        with dctx.stream_reader(compressed_file) as reader:
                            text_stream = io.TextIOWrapper(reader,
                                                           encoding='utf-8')
                            first_line = text_stream.readline()
                elif 'jsonl' in dj_cfg.dataset_path:
                    tmp_f_name = dj_cfg.dataset_path. \
                        replace('.jsonl', '.tmp.jsonl')
                    with open(dj_cfg.dataset_path, 'r') as orig_file:
                        first_line = orig_file.readline()
                assert tmp_f_name is not None and first_line is not None, \
                    'failed to load the first line when ' \
                    f'dj_cfg.dataset_path={dj_cfg.dataset_path}'
                with open(tmp_f_name, 'w') as tmp_file:
                    tmp_file.write(first_line)

                tmp_dj_cfg.dataset_path = tmp_f_name
                tmp_dj_cfg.use_cache = False
                tmp_dj_cfg.use_checkpoint = False

                from data_juicer.config import get_init_configs
                tmp_dj_cfg = get_init_configs(tmp_dj_cfg)
                from data_juicer.core import Analyzer
                tmp_analyzer = Analyzer(tmp_dj_cfg)
                # do not overwrite the true analysis results
                tmp_analyzer.run(skip_export=True)
                os.remove(tmp_f_name)
            else:
                raise NotImplementedError(
                    'For now, the dummy data sample is only supported for '
                    'jsonl-type datasets. Please check your config: '
                    f'{dj_cfg.dataset_path} either does not exist or is not '
                    'a jsonl file.')
        return cls._accessed_by
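
The log is keyed by the caller's module name, which matches the OP's name because each OP is defined in a module of the same name. A self-contained toy of the pattern (the classes _MyConsts and _MyKeys are hypothetical, for illustration only):

# illustrative only -- a toy reproduction of the tracking pattern
class _MyConsts(object):
    alnum_ratio = 'alnum_ratio'

class _MyKeys(object, metaclass=StatsKeysMeta):
    _constants_class = _MyConsts

# this attribute access is intercepted by StatsKeysMeta.__getattr__,
# which records it under the caller's module name; note _accessed_by is
# shared metaclass state, so real OP accesses land in the same dict
_ = _MyKeys.alnum_ratio
print(_MyKeys.get_access_log())  # e.g. {'__main__': {'alnum_ratio'}}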

class StatsKeysConstant(object):
    # === text ===
    alpha_token_ratio = 'alpha_token_ratio'
    alnum_ratio = 'alnum_ratio'
    avg_line_length = 'avg_line_length'
    char_rep_ratio = 'char_rep_ratio'
    flagged_words_ratio = 'flagged_words_ratio'
    lang = 'lang'
    lang_score = 'lang_score'
    max_line_length = 'max_line_length'
    perplexity = 'perplexity'
    special_char_ratio = 'special_char_ratio'
    stopwords_ratio = 'stopwords_ratio'
    text_len = 'text_len'
    text_pair_similarity = 'text_pair_similarity'
    num_action = 'num_action'
    num_dependency_edges = 'num_dependency_edges'
    num_token = 'num_token'
    num_words = 'num_words'
    word_rep_ratio = 'word_rep_ratio'

    # === image ===
    aspect_ratios = 'aspect_ratios'
    image_width = 'image_width'
    image_height = 'image_height'
    image_sizes = 'image_sizes'
    face_ratios = 'face_ratios'
    face_detections = 'face_detections'
    face_counts = 'face_counts'
    image_aesthetics_scores = 'image_aesthetics_scores'
    image_nsfw_score = 'image_nsfw_score'
    image_watermark_prob = 'image_watermark_prob'
    image_pair_similarity = 'image_pair_similarity'

    # === audios ===
    audio_duration = 'audio_duration'
    audio_nmf_snr = 'audio_nmf_snr'
    audio_sizes = 'audio_sizes'

    # === videos ===
    video_duration = 'video_duration'
    video_aspect_ratios = 'video_aspect_ratios'
    video_width = 'video_width'
    video_height = 'video_height'
    video_ocr_area_ratio = 'video_ocr_area_ratio'
    video_aesthetic_score = 'video_aesthetic_score'
    video_frames_aesthetics_score = 'video_frames_aesthetics_score'
    video_motion_score = 'video_motion_score'
    video_nsfw_score = 'video_nsfw_score'
    video_watermark_prob = 'video_watermark_prob'

    # === multimodal ===
    # image-text
    image_text_similarity = 'image_text_similarity'
    image_text_matching_score = 'image_text_matching_score'
    phrase_grounding_recall = 'phrase_grounding_recall'
    # video-text
    video_frames_text_similarity = 'video_frames_text_similarity'
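
This listing does not show the class that exposes these constants through the tracking metaclass, but StatsKeysMeta reads cls._constants_class, which implies wiring along these lines (an assumption inferred from the metaclass contract, not shown in the source above):

# assumed wiring, inferred from StatsKeysMeta's use of _constants_class
class StatsKeys(object, metaclass=StatsKeysMeta):
    _constants_class = StatsKeysConstant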

class HashKeys(object):
    uid = DEFAULT_PREFIX + 'uid'
    hash = DEFAULT_PREFIX + 'hash'
    minhash = DEFAULT_PREFIX + 'minhash'
    simhash = DEFAULT_PREFIX + 'simhash'
    # image
    imagehash = DEFAULT_PREFIX + 'imagehash'
    # video
    videohash = DEFAULT_PREFIX + 'videohash'
    # duplicate flag
    is_unique = DEFAULT_PREFIX + 'is_unique'
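
A sketch of how these columns might be used (the helper below is hypothetical, not Data-Juicer's deduplicator, and assumes each sample carries a 'text' field):

import hashlib

def mark_duplicates(samples):
    # hypothetical helper: hash each text, store the digest under
    # HashKeys.hash, and flag only the first occurrence as unique
    seen = set()
    for sample in samples:
        h = hashlib.md5(sample['text'].encode('utf-8')).hexdigest()
        sample[HashKeys.hash] = h
        sample[HashKeys.is_unique] = h not in seen
        seen.add(h)
    return samples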

class InterVars(object):
    # === text ===
    lines = DEFAULT_PREFIX + 'lines'
    words = DEFAULT_PREFIX + 'words'
    refined_words = DEFAULT_PREFIX + 'refined_words'

    # === image ===
    loaded_images = DEFAULT_PREFIX + 'loaded_images'  # Image

    # === audios ===
    loaded_audios = DEFAULT_PREFIX + 'loaded_audios'  # (data, sampling_rate)

    # === videos ===
    # # InputContainer from av.
    # # Key: {video_path}
    loaded_videos = DEFAULT_PREFIX + 'loaded_videos'
    # # sampled frames.
    # # Key: {video_path}-{frame_sampling_method}[-{frame_num}]
    # # {frame_num} is only used when {frame_sampling_method} is "uniform"
    sampled_frames = DEFAULT_PREFIX + 'sampled_frames'
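
A sketch of the caching scheme the comments above describe (the helper and the exact key format are a hypothetical reading, not Data-Juicer's actual loader):

from PIL import Image

def load_image_cached(sample, path):
    # hypothetical helper: cache decoded images in the sample's context
    # under an InterVars-prefixed key so later OPs can reuse them
    ctx = sample.setdefault(Fields.context, {})
    key = f'{InterVars.loaded_images}-{path}'
    if key not in ctx:
        ctx[key] = Image.open(path)
    return ctx[key]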

class JobRequiredKeys(Enum):
    hook = 'hook'
    dj_configs = 'dj_configs'
    meta_name = 'meta_name'
    extra_configs = 'extra_configs'
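
A sketch of how such an enum might back config validation (the validator is hypothetical and assumes every enumerated key is mandatory):

def check_job_cfg(job_cfg: dict):
    # hypothetical validator: collect the enumerated keys absent from
    # the given job config and fail loudly if any are missing
    missing = [k.value for k in JobRequiredKeys if k.value not in job_cfg]
    if missing:
        raise ValueError(f'job config is missing required keys: {missing}')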