import os
from typing import Optional
from jsonargparse import Namespace
from loguru import logger
from pydantic import PositiveInt
from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
from data_juicer.config import init_configs
from data_juicer.format import load_formatter
from data_juicer.ops import Filter, load_ops
from data_juicer.utils import cache_utils
from .exporter import Exporter
[docs]class Analyzer:
"""
This Analyzer class is used to analyze a specific dataset.
It will compute stats for all filter ops in the config file, apply
multiple analysis (e.g. OverallAnalysis, ColumnWiseAnalysis, etc.)
on these stats, and generate the analysis results (stats tables,
distribution figures, etc.) to help users understand the input
dataset better.
"""
[docs] def __init__(self, cfg: Optional[Namespace] = None):
"""
Initialization method.
:param cfg: optional jsonargparse Namespace dict.
"""
self.cfg = init_configs() if cfg is None else cfg
self.work_dir = self.cfg.work_dir
if self.cfg.use_cache:
logger.info(f'Using cache compression method: '
f'[{self.cfg.cache_compress}]')
cache_utils.CACHE_COMPRESS = self.cfg.cache_compress
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(
dataset_path=self.cfg.dataset_path,
generated_dataset_config=self.cfg.generated_dataset_config,
text_keys=self.cfg.text_keys,
suffixes=self.cfg.suffixes,
add_suffix=self.cfg.add_suffix)
# prepare exporter and check export path suffix
# NOTICE: no need to export dataset texts for analyzer
# (export_ds=False). Instead, only need to export stats
# (export_stats=True).
logger.info('Preparing exporter...')
self.exporter = Exporter(
self.cfg.export_path,
self.cfg.export_shard_size,
self.cfg.export_in_parallel,
self.cfg.np,
export_ds=self.cfg.export_original_dataset,
keep_stats_in_res_ds=self.cfg.export_original_dataset,
export_stats=True)
# parsed_res
self.overall_result = None
self.overall_single_plot_path = None
self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
[docs] def run(self,
load_data_np: Optional[PositiveInt] = None,
skip_export: bool = False,
skip_return: bool = False):
"""
Running the dataset analysis pipeline.
:param load_data_np: number of workers when loading the dataset.
:param skip_export: whether export the results into disk
:param skip_return: skip return for API called.
:return: analyzed dataset.
"""
# 1. format data
logger.info('Loading dataset from data formatter...')
if load_data_np is None:
load_data_np = self.cfg.np
dataset = self.formatter.load_dataset(load_data_np, self.cfg)
# extract processes
logger.info('Preparing process operators...')
ops = load_ops(self.cfg.process, self.cfg.op_fusion)
# 2. stats precompute only for filter ops
logger.info('Computing the stats of dataset...')
stats_collected = False
for op in ops:
if isinstance(op, Filter):
original_process = op.process
op.process = None
dataset = dataset.process(op, work_dir=self.work_dir)
op.process = original_process
stats_collected = True
if not stats_collected:
logger.warning('No stats collected. Please add some Filter ops to '
'the process list in configs.')
return dataset
# 3. data export
logger.info('Exporting dataset to disk...')
self.exporter.export(dataset)
if self.cfg.use_cache and self.cfg.cache_compress:
from data_juicer.utils.compress import compress
compress(dataset)
# 4. analysis and output result to the export path
# 4.1. Only consider fields in Fields.stats
# 4.2. For string fields, only consider its histogram
# 4.3. For numeric fields, consider its histogram and box
# 4.4. Otherwise, DO NOT analyze
logger.info('Applying overall analysis on stats...')
overall_analysis = OverallAnalysis(dataset, self.analysis_path)
self.overall_result = overall_analysis.analyze(
percentiles=self.cfg.percentiles,
num_proc=self.cfg.np,
skip_export=skip_export)
logger.info(f'The overall analysis results are: {self.overall_result}')
logger.info('Applying column-wise analysis on stats...')
column_wise_analysis = ColumnWiseAnalysis(
dataset,
self.analysis_path,
overall_result=self.overall_result,
save_stats_in_one_file=self.cfg.save_stats_in_one_file,
)
column_wise_analysis.analyze(skip_export=skip_export)
if not skip_return:
return dataset