data_juicer
API Reference
data_juicer.core
data_juicer.ops
data_juicer.ops.filter
data_juicer.ops.mapper
data_juicer.ops.deduplicator
data_juicer.ops.selector
data_juicer.ops.common
data_juicer.analysis
data_juicer.config
data_juicer.format
data_juicer
Index
Index
_
|
A
|
B
|
C
|
D
|
E
|
F
|
G
|
H
|
I
|
J
|
K
|
L
|
M
|
N
|
O
|
P
|
Q
|
R
|
S
|
T
|
U
|
V
|
W
_
__init__() (data_juicer.analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.DiversityAnalysis method)
(data_juicer.analysis.OverallAnalysis method)
(data_juicer.core.Adapter method)
(data_juicer.core.Analyzer method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.DefaultExecutor method)
(data_juicer.core.executor.DefaultExecutor method)
(data_juicer.core.executor.ExecutorBase method)
(data_juicer.core.ExecutorBase method)
(data_juicer.core.Exporter method)
(data_juicer.core.Monitor method)
(data_juicer.core.NestedDataset method)
(data_juicer.core.Tracer method)
(data_juicer.format.CsvFormatter method)
(data_juicer.format.EmptyFormatter method)
(data_juicer.format.JsonFormatter method)
(data_juicer.format.LocalFormatter method)
(data_juicer.format.ParquetFormatter method)
(data_juicer.format.RayEmptyFormatter method)
(data_juicer.format.RemoteFormatter method)
(data_juicer.format.TextFormatter method)
(data_juicer.format.TsvFormatter method)
(data_juicer.ops.Aggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.MetaTagsAggregator method)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
(data_juicer.ops.deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.RayVideoDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LLMDifficultyScoreFilter method)
(data_juicer.ops.filter.LLMQualityScoreFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.TextPairSimilarityFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.NaiveGrouper method)
(data_juicer.ops.grouper.NaiveReverseGrouper method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.AudioAddGaussianNoiseMapper method)
(data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.CleanIpMapper method)
(data_juicer.ops.mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.DialogIntentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper method)
(data_juicer.ops.mapper.DialogTopicDetectionMapper method)
(data_juicer.ops.mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.ExtractTablesFromHtmlMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.HumanPreferenceAnnotationMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.ImageRemoveBackgroundMapper method)
(data_juicer.ops.mapper.ImageSegmentMapper method)
(data_juicer.ops.mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.MllmMapper method)
(data_juicer.ops.mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.QueryIntentDetectionMapper method)
(data_juicer.ops.mapper.QuerySentimentDetectionMapper method)
(data_juicer.ops.mapper.QueryTopicDetectionMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.SDXLPrompt2PromptMapper method)
(data_juicer.ops.mapper.SentenceAugmentationMapper method)
(data_juicer.ops.mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.VideoTaggingFromFramesMapper method)
(data_juicer.ops.mapper.WhitespaceNormalizationMapper method)
(data_juicer.ops.Selector method)
(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.RandomSelector method)
(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.TagsSpecifiedFieldSelector method)
(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
A
adapt_workloads() (data_juicer.core.Adapter method)
Adapter (class in data_juicer.core)
add_column() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
add_message() (data_juicer.ops.mapper.ExtractEntityRelationMapper method)
add_same_content_to_new_column() (in module data_juicer.core.data)
Aggregator (class in data_juicer.ops)
AlphanumericFilter (class in data_juicer.ops.filter)
analyze() (data_juicer.analysis.ColumnWiseAnalysis method)
(data_juicer.analysis.DiversityAnalysis method)
(data_juicer.analysis.OverallAnalysis method)
analyze_resource_util_list() (data_juicer.core.Monitor static method)
analyze_single_resource_util() (data_juicer.core.Monitor static method)
analyze_small_batch() (data_juicer.core.Adapter method)
Analyzer (class in data_juicer.core)
attribute_summary() (data_juicer.ops.aggregator.EntityAttributeAggregator method)
AudioAddGaussianNoiseMapper (class in data_juicer.ops.mapper)
AudioDurationFilter (class in data_juicer.ops.filter)
AudioFFmpegWrappedMapper (class in data_juicer.ops.mapper)
AudioNMFSNRFilter (class in data_juicer.ops.filter)
AudioSizeFilter (class in data_juicer.ops.filter)
avaliable_detectors (data_juicer.ops.mapper.VideoSplitBySceneMapper attribute)
AverageLineLengthFilter (class in data_juicer.ops.filter)
B
batch_size_strategy() (data_juicer.core.Adapter method)
build_input() (data_juicer.ops.filter.LLMDifficultyScoreFilter method)
(data_juicer.ops.filter.LLMQualityScoreFilter method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.DialogIntentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper method)
(data_juicer.ops.mapper.DialogTopicDetectionMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
C
calc_minhash() (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
calculate_hash() (data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.deduplicator.RayDocumentDeduplicator method)
(data_juicer.ops.deduplicator.RayImageDeduplicator method)
(data_juicer.ops.deduplicator.RayVideoDeduplicator method)
CalibrateQAMapper (class in data_juicer.ops.mapper)
CalibrateQueryMapper (class in data_juicer.ops.mapper)
CalibrateResponseMapper (class in data_juicer.ops.mapper)
CharacterRepetitionFilter (class in data_juicer.ops.filter)
ChineseConvertMapper (class in data_juicer.ops.mapper)
CleanCopyrightMapper (class in data_juicer.ops.mapper)
CleanEmailMapper (class in data_juicer.ops.mapper)
CleanHtmlMapper (class in data_juicer.ops.mapper)
CleanIpMapper (class in data_juicer.ops.mapper)
CleanLinksMapper (class in data_juicer.ops.mapper)
cleanup_cache_files() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
ColumnWiseAnalysis (class in data_juicer.analysis)
compute() (data_juicer.analysis.DiversityAnalysis method)
compute_flow() (data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
compute_hash() (data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
compute_stats_batched() (data_juicer.ops.Filter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
compute_stats_single() (data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LLMDifficultyScoreFilter method)
(data_juicer.ops.filter.LLMQualityScoreFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextPairSimilarityFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
create_executor() (data_juicer.core.executor.ExecutorFactory static method)
(data_juicer.core.ExecutorFactory static method)
CsvFormatter (class in data_juicer.format)
cuda_device_count() (in module data_juicer)
D
data_juicer
module
data_juicer.analysis
module
data_juicer.config
module
data_juicer.core
module
data_juicer.core.data
module
data_juicer.core.executor
module
data_juicer.download
module
data_juicer.format
module
data_juicer.ops
module
data_juicer.ops.aggregator
module
data_juicer.ops.common
module
data_juicer.ops.deduplicator
module
data_juicer.ops.filter
module
data_juicer.ops.grouper
module
data_juicer.ops.mapper
module
data_juicer.ops.selector
module
data_juicer.tools
module
data_juicer.utils
module
Deduplicator (class in data_juicer.ops)
DEFAULT_ANALYSIS_PATTERN (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_ANALYSIS_TEMPLATE (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_ATTR_PATTERN_TEMPLATE (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
DEFAULT_CANDIDATES_TEMPLATE (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_COMPLETION_DELIMITER (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
DEFAULT_CONTINUE_PROMPT (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_DEMON_PATTERN (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
DEFAULT_ENTITY_PATTERN (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_ENTITY_TYPES (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_EXAMPLE_PROMPT (data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
DEFAULT_FIELD_TEMPLATE (data_juicer.ops.filter.LLMDifficultyScoreFilter attribute)
(data_juicer.ops.filter.LLMQualityScoreFilter attribute)
DEFAULT_IF_LOOP_PROMPT (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_INPUT_TEMPLATE (data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.MetaTagsAggregator attribute)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute)
(data_juicer.ops.aggregator.NestedAggregator attribute)
(data_juicer.ops.filter.LLMDifficultyScoreFilter attribute)
(data_juicer.ops.filter.LLMQualityScoreFilter attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_INTENSITY_PATTERN (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
DEFAULT_INTENSITY_TEMPLATE (data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
DEFAULT_LABEL_CONFIG (data_juicer.ops.mapper.HumanPreferenceAnnotationMapper attribute)
DEFAULT_LABELS_PATTERN (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_LABELS_TEMPLATE (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_OUTPUT_PATTERN (data_juicer.ops.aggregator.MetaTagsAggregator attribute)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
DEFAULT_OUTPUT_PATTERN_TEMPLATE (data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_PROMPT_TEMPLATE (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
(data_juicer.ops.mapper.ExtractKeywordMapper attribute)
DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
DEFAULT_QUERY_TEMPLATE (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_RECORD_DELIMITER (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_REFERENCE_TEMPLATE (data_juicer.ops.mapper.CalibrateQAMapper attribute)
DEFAULT_RELATION_PATTERN (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DEFAULT_RESPONSE_TEMPLATE (data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
DEFAULT_SUB_DOC_TEMPLATE (data_juicer.ops.aggregator.NestedAggregator attribute)
DEFAULT_SYSTEM_PROMPT (data_juicer.ops.aggregator.MetaTagsAggregator attribute)
(data_juicer.ops.aggregator.NestedAggregator attribute)
(data_juicer.ops.filter.LLMDifficultyScoreFilter attribute)
(data_juicer.ops.filter.LLMQualityScoreFilter attribute)
(data_juicer.ops.mapper.CalibrateQAMapper attribute)
(data_juicer.ops.mapper.CalibrateQueryMapper attribute)
(data_juicer.ops.mapper.CalibrateResponseMapper attribute)
(data_juicer.ops.mapper.DialogIntentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper attribute)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper attribute)
(data_juicer.ops.mapper.DialogTopicDetectionMapper attribute)
(data_juicer.ops.mapper.ExtractEventMapper attribute)
(data_juicer.ops.mapper.ExtractNicknameMapper attribute)
(data_juicer.ops.mapper.ExtractSupportTextMapper attribute)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
(data_juicer.ops.mapper.OptimizeQAMapper attribute)
(data_juicer.ops.mapper.OptimizeQueryMapper attribute)
(data_juicer.ops.mapper.OptimizeResponseMapper attribute)
(data_juicer.ops.mapper.PairPreferenceMapper attribute)
DEFAULT_SYSTEM_PROMPT_TEMPLATE (data_juicer.ops.mapper.ExtractEntityAttributeMapper attribute)
(data_juicer.ops.mapper.RelationIdentityMapper attribute)
DEFAULT_SYSTEM_TEMPLATE (data_juicer.ops.aggregator.EntityAttributeAggregator attribute)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute)
DEFAULT_TAG_TEMPLATE (data_juicer.ops.aggregator.MetaTagsAggregator attribute)
DEFAULT_TARGET_TAG_TEMPLATE (data_juicer.ops.aggregator.MetaTagsAggregator attribute)
DEFAULT_TUPLE_DELIMITER (data_juicer.ops.mapper.ExtractEntityRelationMapper attribute)
DefaultExecutor (class in data_juicer.core)
(class in data_juicer.core.executor)
DialogIntentDetectionMapper (class in data_juicer.ops.mapper)
DialogSentimentDetectionMapper (class in data_juicer.ops.mapper)
DialogSentimentIntensityMapper (class in data_juicer.ops.mapper)
DialogTopicDetectionMapper (class in data_juicer.ops.mapper)
DiversityAnalysis (class in data_juicer.analysis)
DJDataset (class in data_juicer.core.data)
DocumentDeduplicator (class in data_juicer.ops.deduplicator)
DocumentMinhashDeduplicator (class in data_juicer.ops.deduplicator)
DocumentSimhashDeduplicator (class in data_juicer.ops.deduplicator)
draw_box() (data_juicer.analysis.ColumnWiseAnalysis method)
draw_hist() (data_juicer.analysis.ColumnWiseAnalysis method)
draw_resource_util_graph() (data_juicer.core.Monitor static method)
draw_wordcloud() (data_juicer.analysis.ColumnWiseAnalysis method)
DYNAMIC_FIELDS (data_juicer.core.Monitor attribute)
E
EMPTY_HASH_VALUE (data_juicer.ops.deduplicator.RayBasicDeduplicator attribute)
(data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator attribute)
EmptyFormatter (class in data_juicer.format)
EntityAttributeAggregator (class in data_juicer.ops.aggregator)
execute_and_probe() (data_juicer.core.Adapter static method)
ExecutorBase (class in data_juicer.core)
(class in data_juicer.core.executor)
ExecutorFactory (class in data_juicer.core)
(class in data_juicer.core.executor)
ExpandMacroMapper (class in data_juicer.ops.mapper)
export() (data_juicer.core.Exporter method)
export_compute_stats() (data_juicer.core.Exporter method)
export_config() (in module data_juicer.config)
Exporter (class in data_juicer.core)
ExtractEntityAttributeMapper (class in data_juicer.ops.mapper)
ExtractEntityRelationMapper (class in data_juicer.ops.mapper)
ExtractEventMapper (class in data_juicer.ops.mapper)
ExtractKeywordMapper (class in data_juicer.ops.mapper)
ExtractNicknameMapper (class in data_juicer.ops.mapper)
ExtractSupportTextMapper (class in data_juicer.ops.mapper)
ExtractTablesFromHtmlMapper (class in data_juicer.ops.mapper)
F
Filter (class in data_juicer.ops)
filter() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
filter_with_union_find() (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
FixUnicodeMapper (class in data_juicer.ops.mapper)
FlaggedWordFilter (class in data_juicer.ops.filter)
FrequencySpecifiedFieldSelector (class in data_juicer.ops.selector)
from_dict() (data_juicer.core.data.NestedDataset class method)
(data_juicer.core.NestedDataset class method)
G
GenerateQAFromExamplesMapper (class in data_juicer.ops.mapper)
GenerateQAFromTextMapper (class in data_juicer.ops.mapper)
get() (data_juicer.core.data.DJDataset method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
get_column() (data_juicer.core.data.DJDataset method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
get_default_cfg() (in module data_juicer.config)
get_init_configs() (in module data_juicer.config)
get_reader() (data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
get_sentences_from_document() (in module data_juicer.ops.common)
get_split_key_frame() (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
get_text_chunks() (data_juicer.ops.mapper.TextChunkMapper method)
get_words_from_document() (in module data_juicer.ops.common)
GiB (data_juicer.core.Exporter attribute)
Grouper (class in data_juicer.ops)
H
HumanPreferenceAnnotationMapper (class in data_juicer.ops.mapper)
I
ImageAestheticsFilter (class in data_juicer.ops.filter)
ImageAspectRatioFilter (class in data_juicer.ops.filter)
ImageBlurMapper (class in data_juicer.ops.mapper)
ImageCaptioningFromGPT4VMapper (class in data_juicer.ops.mapper)
ImageCaptioningMapper (class in data_juicer.ops.mapper)
ImageDeduplicator (class in data_juicer.ops.deduplicator)
ImageDiffusionMapper (class in data_juicer.ops.mapper)
ImageFaceBlurMapper (class in data_juicer.ops.mapper)
ImageFaceCountFilter (class in data_juicer.ops.filter)
ImageFaceRatioFilter (class in data_juicer.ops.filter)
ImageNSFWFilter (class in data_juicer.ops.filter)
ImagePairSimilarityFilter (class in data_juicer.ops.filter)
ImageRemoveBackgroundMapper (class in data_juicer.ops.mapper)
ImageSegmentMapper (class in data_juicer.ops.mapper)
ImageShapeFilter (class in data_juicer.ops.filter)
ImageSizeFilter (class in data_juicer.ops.filter)
ImageTaggingMapper (class in data_juicer.ops.mapper)
ImageTextMatchingFilter (class in data_juicer.ops.filter)
ImageTextSimilarityFilter (class in data_juicer.ops.filter)
ImageWatermarkFilter (class in data_juicer.ops.filter)
init_configs() (in module data_juicer.config)
insight_mining() (data_juicer.core.Adapter method)
is_cuda_available() (in module data_juicer)
J
JsonFormatter (class in data_juicer.format)
K
KeyValueGrouper (class in data_juicer.ops.grouper)
KiB (data_juicer.core.Exporter attribute)
L
LanguageIDScoreFilter (class in data_juicer.ops.filter)
light_rag_extraction() (data_juicer.ops.mapper.ExtractEntityRelationMapper method)
LLMDifficultyScoreFilter (class in data_juicer.ops.filter)
LLMQualityScoreFilter (class in data_juicer.ops.filter)
load_dataset() (data_juicer.format.EmptyFormatter method)
(data_juicer.format.LocalFormatter method)
(data_juicer.format.RayEmptyFormatter method)
(data_juicer.format.RemoteFormatter method)
(data_juicer.format.TextFormatter method)
load_from_disk() (data_juicer.core.data.NestedDataset static method)
(data_juicer.core.NestedDataset static method)
load_ops() (in module data_juicer.ops)
LocalFormatter (class in data_juicer.format)
M
map() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
Mapper (class in data_juicer.ops)
MAX_BATCH_SIZE (data_juicer.core.Adapter attribute)
MaximumLineLengthFilter (class in data_juicer.ops.filter)
merge() (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
merge_config() (in module data_juicer.config)
merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)
merge_op_batch() (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
meta_map() (data_juicer.ops.aggregator.MetaTagsAggregator method)
MetaTagsAggregator (class in data_juicer.ops.aggregator)
MiB (data_juicer.core.Exporter attribute)
MllmMapper (class in data_juicer.ops.mapper)
module
data_juicer
data_juicer.analysis
data_juicer.config
data_juicer.core
data_juicer.core.data
data_juicer.core.executor
data_juicer.download
data_juicer.format
data_juicer.ops
data_juicer.ops.aggregator
data_juicer.ops.common
data_juicer.ops.deduplicator
data_juicer.ops.filter
data_juicer.ops.grouper
data_juicer.ops.mapper
data_juicer.ops.selector
data_juicer.tools
data_juicer.utils
Monitor (class in data_juicer.core)
monitor_all_resources() (data_juicer.core.Monitor method)
monitor_current_resources() (data_juicer.core.Monitor static method)
monitor_func() (data_juicer.core.Monitor static method)
MostRelevantEntitiesAggregator (class in data_juicer.ops.aggregator)
N
NaiveGrouper (class in data_juicer.ops.grouper)
NaiveReverseGrouper (class in data_juicer.ops.grouper)
NestedAggregator (class in data_juicer.ops.aggregator)
NestedDataset (class in data_juicer.core)
(class in data_juicer.core.data)
NlpaugEnMapper (class in data_juicer.ops.mapper)
NlpcdaZhMapper (class in data_juicer.ops.mapper)
null_value (data_juicer.format.EmptyFormatter property)
(data_juicer.format.RayEmptyFormatter property)
O
OptimizeQAMapper (class in data_juicer.ops.mapper)
OptimizeQueryMapper (class in data_juicer.ops.mapper)
OptimizeResponseMapper (class in data_juicer.ops.mapper)
OverallAnalysis (class in data_juicer.analysis)
P
PairPreferenceMapper (class in data_juicer.ops.mapper)
ParquetFormatter (class in data_juicer.format)
parse_output() (data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.MetaTagsAggregator method)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.filter.LLMDifficultyScoreFilter method)
(data_juicer.ops.filter.LLMQualityScoreFilter method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.CalibrateQueryMapper method)
(data_juicer.ops.mapper.CalibrateResponseMapper method)
(data_juicer.ops.mapper.DialogIntentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper method)
(data_juicer.ops.mapper.DialogTopicDetectionMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.OptimizeQueryMapper method)
(data_juicer.ops.mapper.OptimizeResponseMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
PerplexityFilter (class in data_juicer.ops.filter)
PhraseGroundingRecallFilter (class in data_juicer.ops.filter)
prepare_side_configs() (in module data_juicer.config)
probe_small_batch() (data_juicer.core.Adapter method)
process() (data_juicer.core.data.DJDataset method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.DocumentDeduplicator method)
(data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method)
(data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method)
(data_juicer.ops.deduplicator.ImageDeduplicator method)
(data_juicer.ops.deduplicator.VideoDeduplicator method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.grouper.KeyValueGrouper method)
(data_juicer.ops.grouper.NaiveGrouper method)
(data_juicer.ops.grouper.NaiveReverseGrouper method)
(data_juicer.ops.Selector method)
(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)
(data_juicer.ops.selector.RandomSelector method)
(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
(data_juicer.ops.selector.TagsSpecifiedFieldSelector method)
(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
process_batched() (data_juicer.ops.Filter method)
(data_juicer.ops.filter.AlphanumericFilter method)
(data_juicer.ops.filter.AverageLineLengthFilter method)
(data_juicer.ops.filter.CharacterRepetitionFilter method)
(data_juicer.ops.filter.FlaggedWordFilter method)
(data_juicer.ops.filter.ImageAspectRatioFilter method)
(data_juicer.ops.filter.MaximumLineLengthFilter method)
(data_juicer.ops.filter.PerplexityFilter method)
(data_juicer.ops.filter.SpecialCharactersFilter method)
(data_juicer.ops.filter.TextLengthFilter method)
(data_juicer.ops.filter.WordRepetitionFilter method)
(data_juicer.ops.filter.WordsNumFilter method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.ChineseConvertMapper method)
(data_juicer.ops.mapper.CleanCopyrightMapper method)
(data_juicer.ops.mapper.CleanEmailMapper method)
(data_juicer.ops.mapper.CleanHtmlMapper method)
(data_juicer.ops.mapper.CleanIpMapper method)
(data_juicer.ops.mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.ExtractEventMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
(data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)
(data_juicer.ops.mapper.ImageCaptioningMapper method)
(data_juicer.ops.mapper.ImageDiffusionMapper method)
(data_juicer.ops.mapper.NlpaugEnMapper method)
(data_juicer.ops.mapper.NlpcdaZhMapper method)
(data_juicer.ops.mapper.PunctuationNormalizationMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.QueryIntentDetectionMapper method)
(data_juicer.ops.mapper.QuerySentimentDetectionMapper method)
(data_juicer.ops.mapper.QueryTopicDetectionMapper method)
(data_juicer.ops.mapper.RemoveBibliographyMapper method)
(data_juicer.ops.mapper.RemoveCommentsMapper method)
(data_juicer.ops.mapper.RemoveHeaderMapper method)
(data_juicer.ops.mapper.RemoveLongWordsMapper method)
(data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper method)
(data_juicer.ops.mapper.RemoveRepeatSentencesMapper method)
(data_juicer.ops.mapper.RemoveSpecificCharsMapper method)
(data_juicer.ops.mapper.RemoveTableTextMapper method)
(data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
(data_juicer.ops.mapper.ReplaceContentMapper method)
(data_juicer.ops.mapper.SentenceSplitMapper method)
(data_juicer.ops.mapper.TextChunkMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromAudioMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromFramesMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper method)
(data_juicer.ops.mapper.VideoCaptioningFromVideoMapper method)
(data_juicer.ops.mapper.VideoSplitByDurationMapper method)
(data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
(data_juicer.ops.mapper.WhitespaceNormalizationMapper method)
process_single() (data_juicer.ops.Aggregator method)
(data_juicer.ops.aggregator.EntityAttributeAggregator method)
(data_juicer.ops.aggregator.MetaTagsAggregator method)
(data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method)
(data_juicer.ops.aggregator.NestedAggregator method)
(data_juicer.ops.deduplicator.RayBasicDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.filter.AudioDurationFilter method)
(data_juicer.ops.filter.AudioNMFSNRFilter method)
(data_juicer.ops.filter.AudioSizeFilter method)
(data_juicer.ops.filter.ImageAestheticsFilter method)
(data_juicer.ops.filter.ImageFaceCountFilter method)
(data_juicer.ops.filter.ImageFaceRatioFilter method)
(data_juicer.ops.filter.ImageNSFWFilter method)
(data_juicer.ops.filter.ImagePairSimilarityFilter method)
(data_juicer.ops.filter.ImageShapeFilter method)
(data_juicer.ops.filter.ImageSizeFilter method)
(data_juicer.ops.filter.ImageTextMatchingFilter method)
(data_juicer.ops.filter.ImageTextSimilarityFilter method)
(data_juicer.ops.filter.ImageWatermarkFilter method)
(data_juicer.ops.filter.LanguageIDScoreFilter method)
(data_juicer.ops.filter.LLMDifficultyScoreFilter method)
(data_juicer.ops.filter.LLMQualityScoreFilter method)
(data_juicer.ops.filter.PhraseGroundingRecallFilter method)
(data_juicer.ops.filter.SpecifiedFieldFilter method)
(data_juicer.ops.filter.SpecifiedNumericFieldFilter method)
(data_juicer.ops.filter.StopWordsFilter method)
(data_juicer.ops.filter.SuffixFilter method)
(data_juicer.ops.filter.TextActionFilter method)
(data_juicer.ops.filter.TextEntityDependencyFilter method)
(data_juicer.ops.filter.TextPairSimilarityFilter method)
(data_juicer.ops.filter.TokenNumFilter method)
(data_juicer.ops.filter.VideoAestheticsFilter method)
(data_juicer.ops.filter.VideoAspectRatioFilter method)
(data_juicer.ops.filter.VideoDurationFilter method)
(data_juicer.ops.filter.VideoFramesTextSimilarityFilter method)
(data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoNSFWFilter method)
(data_juicer.ops.filter.VideoOcrAreaRatioFilter method)
(data_juicer.ops.filter.VideoResolutionFilter method)
(data_juicer.ops.filter.VideoTaggingFromFramesFilter method)
(data_juicer.ops.filter.VideoWatermarkFilter method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.mapper.AudioAddGaussianNoiseMapper method)
(data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
(data_juicer.ops.mapper.CalibrateQAMapper method)
(data_juicer.ops.mapper.DialogIntentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentDetectionMapper method)
(data_juicer.ops.mapper.DialogSentimentIntensityMapper method)
(data_juicer.ops.mapper.DialogTopicDetectionMapper method)
(data_juicer.ops.mapper.ExtractEntityAttributeMapper method)
(data_juicer.ops.mapper.ExtractEntityRelationMapper method)
(data_juicer.ops.mapper.ExtractKeywordMapper method)
(data_juicer.ops.mapper.ExtractNicknameMapper method)
(data_juicer.ops.mapper.ExtractSupportTextMapper method)
(data_juicer.ops.mapper.ExtractTablesFromHtmlMapper method)
(data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
(data_juicer.ops.mapper.ImageFaceBlurMapper method)
(data_juicer.ops.mapper.ImageRemoveBackgroundMapper method)
(data_juicer.ops.mapper.ImageSegmentMapper method)
(data_juicer.ops.mapper.ImageTaggingMapper method)
(data_juicer.ops.mapper.MllmMapper method)
(data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.PairPreferenceMapper method)
(data_juicer.ops.mapper.PythonFileMapper method)
(data_juicer.ops.mapper.PythonLambdaMapper method)
(data_juicer.ops.mapper.RelationIdentityMapper method)
(data_juicer.ops.mapper.SDXLPrompt2PromptMapper method)
(data_juicer.ops.mapper.SentenceAugmentationMapper method)
(data_juicer.ops.mapper.VideoExtractFramesMapper method)
(data_juicer.ops.mapper.VideoFaceBlurMapper method)
(data_juicer.ops.mapper.VideoFFmpegWrappedMapper method)
(data_juicer.ops.mapper.VideoRemoveWatermarkMapper method)
(data_juicer.ops.mapper.VideoResizeAspectRatioMapper method)
(data_juicer.ops.mapper.VideoResizeResolutionMapper method)
(data_juicer.ops.mapper.VideoSplitBySceneMapper method)
(data_juicer.ops.mapper.VideoTaggingFromAudioMapper method)
(data_juicer.ops.mapper.VideoTaggingFromFramesMapper method)
PunctuationNormalizationMapper (class in data_juicer.ops.mapper)
PythonFileMapper (class in data_juicer.ops.mapper)
PythonLambdaMapper (class in data_juicer.ops.mapper)
Q
query_most_relevant_entities() (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method)
QueryIntentDetectionMapper (class in data_juicer.ops.mapper)
QuerySentimentDetectionMapper (class in data_juicer.ops.mapper)
QueryTopicDetectionMapper (class in data_juicer.ops.mapper)
R
RandomSelector (class in data_juicer.ops.selector)
RangeSpecifiedFieldSelector (class in data_juicer.ops.selector)
RayBasicDeduplicator (class in data_juicer.ops.deduplicator)
RayBTSMinhashDeduplicator (class in data_juicer.ops.deduplicator)
RayDocumentDeduplicator (class in data_juicer.ops.deduplicator)
RayEmptyFormatter (class in data_juicer.format)
RayImageDeduplicator (class in data_juicer.ops.deduplicator)
RayVideoDeduplicator (class in data_juicer.ops.deduplicator)
recursive_summary() (data_juicer.ops.aggregator.NestedAggregator method)
recursively_chunk() (data_juicer.ops.mapper.TextChunkMapper method)
refine_single_column() (data_juicer.analysis.OverallAnalysis method)
RelationIdentityMapper (class in data_juicer.ops.mapper)
RemoteFormatter (class in data_juicer.format)
remove_columns() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
RemoveBibliographyMapper (class in data_juicer.ops.mapper)
RemoveCommentsMapper (class in data_juicer.ops.mapper)
RemoveHeaderMapper (class in data_juicer.ops.mapper)
RemoveLongWordsMapper (class in data_juicer.ops.mapper)
RemoveNonChineseCharacterlMapper (class in data_juicer.ops.mapper)
RemoveRepeatSentencesMapper (class in data_juicer.ops.mapper)
RemoveSpecificCharsMapper (class in data_juicer.ops.mapper)
RemoveTableTextMapper (class in data_juicer.ops.mapper)
RemoveWordsWithIncorrectSubstringsMapper (class in data_juicer.ops.mapper)
ReplaceContentMapper (class in data_juicer.ops.mapper)
run() (data_juicer.core.Analyzer method)
(data_juicer.core.DefaultExecutor method)
(data_juicer.core.executor.DefaultExecutor method)
(data_juicer.core.executor.ExecutorBase method)
(data_juicer.core.ExecutorBase method)
(data_juicer.ops.Aggregator method)
(data_juicer.ops.Deduplicator method)
(data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method)
(data_juicer.ops.Filter method)
(data_juicer.ops.Grouper method)
(data_juicer.ops.Mapper method)
(data_juicer.ops.Selector method)
S
sample_data() (data_juicer.core.DefaultExecutor method)
(data_juicer.core.executor.DefaultExecutor method)
schema() (data_juicer.core.data.DJDataset method)
(data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
SDXLPrompt2PromptMapper (class in data_juicer.ops.mapper)
select() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
select_columns() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
Selector (class in data_juicer.ops)
SentenceAugmentationMapper (class in data_juicer.ops.mapper)
SentenceSplitMapper (class in data_juicer.ops.mapper)
setup_model() (data_juicer.ops.filter.VideoMotionScoreFilter method)
(data_juicer.ops.filter.VideoMotionScoreRaftFilter method)
should_keep_long_word() (data_juicer.ops.mapper.RemoveLongWordsMapper method)
should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper method)
SpecialCharactersFilter (class in data_juicer.ops.filter)
SpecifiedFieldFilter (class in data_juicer.ops.filter)
SpecifiedNumericFieldFilter (class in data_juicer.ops.filter)
split_on_newline_tab_whitespace() (in module data_juicer.ops.common)
split_on_whitespace() (in module data_juicer.ops.common)
split_text_by_punctuation() (in module data_juicer.ops.common)
split_videos_by_duration() (data_juicer.ops.mapper.VideoSplitByDurationMapper method)
StopWordsFilter (class in data_juicer.ops.filter)
STRATEGY (data_juicer.ops.mapper.VideoResizeAspectRatioMapper attribute)
strip() (in module data_juicer.ops.common)
SUFFIXES (data_juicer.format.CsvFormatter attribute)
(data_juicer.format.EmptyFormatter attribute)
(data_juicer.format.JsonFormatter attribute)
(data_juicer.format.ParquetFormatter attribute)
(data_juicer.format.RayEmptyFormatter attribute)
(data_juicer.format.TextFormatter attribute)
(data_juicer.format.TsvFormatter attribute)
SuffixFilter (class in data_juicer.ops.filter)
T
TagsSpecifiedFieldSelector (class in data_juicer.ops.selector)
take_batch() (data_juicer.core.Adapter static method)
TextActionFilter (class in data_juicer.ops.filter)
TextChunkMapper (class in data_juicer.ops.mapper)
TextEntityDependencyFilter (class in data_juicer.ops.filter)
TextFormatter (class in data_juicer.format)
TextLengthFilter (class in data_juicer.ops.filter)
TextPairSimilarityFilter (class in data_juicer.ops.filter)
TiB (data_juicer.core.Exporter attribute)
to_json() (data_juicer.core.Exporter static method)
to_jsonl() (data_juicer.core.Exporter static method)
to_parquet() (data_juicer.core.Exporter static method)
TokenNumFilter (class in data_juicer.ops.filter)
TopkSpecifiedFieldSelector (class in data_juicer.ops.selector)
trace_batch_mapper() (data_juicer.core.Tracer method)
trace_deduplicator() (data_juicer.core.Tracer method)
trace_filter() (data_juicer.core.Tracer method)
trace_mapper() (data_juicer.core.Tracer method)
Tracer (class in data_juicer.core)
TsvFormatter (class in data_juicer.format)
U
update_args() (data_juicer.core.data.NestedDataset method)
(data_juicer.core.NestedDataset method)
V
VideoAestheticsFilter (class in data_juicer.ops.filter)
VideoAspectRatioFilter (class in data_juicer.ops.filter)
VideoCaptioningFromAudioMapper (class in data_juicer.ops.mapper)
VideoCaptioningFromFramesMapper (class in data_juicer.ops.mapper)
VideoCaptioningFromSummarizerMapper (class in data_juicer.ops.mapper)
VideoCaptioningFromVideoMapper (class in data_juicer.ops.mapper)
VideoDeduplicator (class in data_juicer.ops.deduplicator)
VideoDurationFilter (class in data_juicer.ops.filter)
VideoExtractFramesMapper (class in data_juicer.ops.mapper)
VideoFaceBlurMapper (class in data_juicer.ops.mapper)
VideoFFmpegWrappedMapper (class in data_juicer.ops.mapper)
VideoFramesTextSimilarityFilter (class in data_juicer.ops.filter)
VideoMotionScoreFilter (class in data_juicer.ops.filter)
VideoMotionScoreRaftFilter (class in data_juicer.ops.filter)
VideoNSFWFilter (class in data_juicer.ops.filter)
VideoOcrAreaRatioFilter (class in data_juicer.ops.filter)
VideoRemoveWatermarkMapper (class in data_juicer.ops.mapper)
VideoResizeAspectRatioMapper (class in data_juicer.ops.mapper)
VideoResizeResolutionMapper (class in data_juicer.ops.mapper)
VideoResolutionFilter (class in data_juicer.ops.filter)
VideoSplitByDurationMapper (class in data_juicer.ops.mapper)
VideoSplitByKeyFrameMapper (class in data_juicer.ops.mapper)
VideoSplitBySceneMapper (class in data_juicer.ops.mapper)
VideoTaggingFromAudioMapper (class in data_juicer.ops.mapper)
VideoTaggingFromFramesFilter (class in data_juicer.ops.filter)
VideoTaggingFromFramesMapper (class in data_juicer.ops.mapper)
VideoWatermarkFilter (class in data_juicer.ops.filter)
W
WhitespaceNormalizationMapper (class in data_juicer.ops.mapper)
WordRepetitionFilter (class in data_juicer.ops.filter)
words_augmentation() (in module data_juicer.ops.common)
words_refinement() (in module data_juicer.ops.common)
WordsNumFilter (class in data_juicer.ops.filter)
wrap_func_with_nested_access() (in module data_juicer.core.data)