import copy
import inspect
import io
import os
from enum import Enum
import zstandard as zstd
from loguru import logger
# Common prefix prepended to the field, hash and intermediate-variable names
# defined in this module.
DEFAULT_PREFIX: str = '__dj__'
[docs]
class Fields:
    """Names of the reserved sample fields.

    Every value is ``DEFAULT_PREFIX`` followed by a suffix that ends in
    ``'__'``.
    """

    stats = DEFAULT_PREFIX + 'stats__'
    meta = DEFAULT_PREFIX + 'meta__'
    context = DEFAULT_PREFIX + 'context__'
    suffix = DEFAULT_PREFIX + 'suffix__'

    # ---- tags stored in meta ----
    # tags of video frames
    video_frame_tags = DEFAULT_PREFIX + 'video_frame_tags__'
    # tags of video audio
    video_audio_tags = DEFAULT_PREFIX + 'video_audio_tags__'
    # tags of images
    image_tags = DEFAULT_PREFIX + 'image_tags__'

    # video frames
    video_frames = DEFAULT_PREFIX + 'video_frames__'

    # name of the original file from which this sample was derived
    source_file = DEFAULT_PREFIX + 'source_file__'

    # name of the directory that stores the produced multimodal data
    multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'

    # ---- field names for info extraction ----
    event_description = DEFAULT_PREFIX + 'event_description__'
    # list of characters relevant to the event
    relevant_characters = DEFAULT_PREFIX + 'relevant_characters__'
    # the given main entities for attribute extraction
    main_entities = DEFAULT_PREFIX + 'main_entities__'
    # the given attributes to be extracted
    attributes = DEFAULT_PREFIX + 'attributes__'
    # the extracted attribute descriptions
    attribute_descriptions = DEFAULT_PREFIX + 'attribute_descriptions__'
    # texts extracted from the raw data that support the attribute
    attribute_support_texts = DEFAULT_PREFIX + 'attribute_support_texts__'
    # the nickname relationship
    nickname = DEFAULT_PREFIX + 'nickname__'

    # -- entity for the knowledge graph --
    entity = DEFAULT_PREFIX + 'entity__'
    # name of the entity
    entity_name = DEFAULT_PREFIX + 'entity_name__'
    # type of the entity
    entity_type = DEFAULT_PREFIX + 'entity_type__'
    # description of the entity
    # NOTE(review): the value repeats 'entity_' unlike its siblings. It is
    # kept verbatim because changing it would rename the stored field --
    # confirm whether the duplication is intentional.
    entity_description = DEFAULT_PREFIX + 'entity_entity_description__'

    # -- relationship for the knowledge graph --
    relation = DEFAULT_PREFIX + 'relation__'
    # source entity of the relation
    source_entity = DEFAULT_PREFIX + 'relation_source_entity__'
    # target entity of the relation
    target_entity = DEFAULT_PREFIX + 'relation_target_entity__'
    # description of the relation
    relation_description = DEFAULT_PREFIX + 'relation_description__'
    # keywords of the relation
    relation_keywords = DEFAULT_PREFIX + 'relation_keywords__'
    # strength of the relation
    relation_strength = DEFAULT_PREFIX + 'relation_strength__'

    # keyword in a text
    keyword = DEFAULT_PREFIX + 'keyword__'
    # support text
    support_text = DEFAULT_PREFIX + 'support_text__'
[docs]
class StatsKeysConstant:
    """String constants naming the per-sample statistics.

    Each attribute's value is identical to the attribute's own name.
    """

    # ---- text ----
    alpha_token_ratio = 'alpha_token_ratio'
    alnum_ratio = 'alnum_ratio'
    avg_line_length = 'avg_line_length'
    char_rep_ratio = 'char_rep_ratio'
    flagged_words_ratio = 'flagged_words_ratio'
    lang = 'lang'
    lang_score = 'lang_score'
    max_line_length = 'max_line_length'
    perplexity = 'perplexity'
    special_char_ratio = 'special_char_ratio'
    stopwords_ratio = 'stopwords_ratio'
    text_len = 'text_len'
    num_action = 'num_action'
    num_dependency_edges = 'num_dependency_edges'
    num_token = 'num_token'
    num_words = 'num_words'
    word_rep_ratio = 'word_rep_ratio'

    # ---- image ----
    aspect_ratios = 'aspect_ratios'
    image_width = 'image_width'
    image_height = 'image_height'
    image_sizes = 'image_sizes'
    face_ratios = 'face_ratios'
    face_detections = 'face_detections'
    face_counts = 'face_counts'
    image_aesthetics_scores = 'image_aesthetics_scores'
    image_nsfw_score = 'image_nsfw_score'
    image_watermark_prob = 'image_watermark_prob'
    image_pair_similarity = 'image_pair_similarity'

    # ---- audios ----
    audio_duration = 'audio_duration'
    audio_nmf_snr = 'audio_nmf_snr'
    audio_sizes = 'audio_sizes'

    # ---- videos ----
    video_duration = 'video_duration'
    video_aspect_ratios = 'video_aspect_ratios'
    video_width = 'video_width'
    video_height = 'video_height'
    video_ocr_area_ratio = 'video_ocr_area_ratio'
    video_aesthetic_score = 'video_aesthetic_score'
    video_frames_aesthetics_score = 'video_frames_aesthetics_score'
    video_motion_score = 'video_motion_score'
    video_nsfw_score = 'video_nsfw_score'
    video_watermark_prob = 'video_watermark_prob'

    # ---- multimodal ----
    # image-text
    image_text_similarity = 'image_text_similarity'
    image_text_matching_score = 'image_text_matching_score'
    phrase_grounding_recall = 'phrase_grounding_recall'
    # video-text
    video_frames_text_similarity = 'video_frames_text_similarity'
[docs]
class StatsKeys(metaclass=StatsKeysMeta):
    """Stats-key namespace built with the ``StatsKeysMeta`` metaclass.

    ``_constants_class`` points at ``StatsKeysConstant``, which holds the
    actual key strings.

    NOTE(review): ``StatsKeysMeta`` is defined outside this excerpt;
    presumably it resolves attribute access through ``_constants_class`` --
    confirm against its definition.
    """

    _constants_class = StatsKeysConstant
[docs]
class HashKeys:
    """Names of the fields holding hash values and the duplicate flag.

    Every value is ``DEFAULT_PREFIX`` followed by the attribute's own name.
    """

    hash = DEFAULT_PREFIX + 'hash'
    minhash = DEFAULT_PREFIX + 'minhash'
    simhash = DEFAULT_PREFIX + 'simhash'

    # image
    imagehash = DEFAULT_PREFIX + 'imagehash'

    # video
    videohash = DEFAULT_PREFIX + 'videohash'

    # duplicate flag
    is_duplicate = DEFAULT_PREFIX + 'is_duplicate'
[docs]
class InterVars:
    """Names of the intermediate variables.

    Every value is ``DEFAULT_PREFIX`` followed by the attribute's own name.
    """

    # ---- text ----
    lines = DEFAULT_PREFIX + 'lines'
    words = DEFAULT_PREFIX + 'words'
    refined_words = DEFAULT_PREFIX + 'refined_words'

    # ---- image ----
    loaded_images = DEFAULT_PREFIX + 'loaded_images'  # Image

    # ---- audios ----
    loaded_audios = DEFAULT_PREFIX + 'loaded_audios'  # (data, sampling_rate)

    # ---- videos ----
    # InputContainer from av.
    # Key: {video_path}
    loaded_videos = DEFAULT_PREFIX + 'loaded_videos'

    # Sampled frames.
    # Key: {video_path}-{frame_sampling_method}[-{frame_num}]
    # {frame_num} is only used when {frame_sampling_method} is "uniform".
    sampled_frames = DEFAULT_PREFIX + 'sampled_frames'
[docs]
class JobRequiredKeys(Enum):
    """Enumeration of job keys; each member's value equals its name."""

    hook = 'hook'
    dj_configs = 'dj_configs'
    meta_name = 'meta_name'
    extra_configs = 'extra_configs'