build

Data Build Module - core data pipeline orchestrator for end-to-end data processing. Coordinates loading, processing, annotation, and export stages with flexible configuration.

DataBuilder

Bases: BaseDataModule

Main pipeline orchestrator that coordinates all data processing stages.

Manages the complete data workflow from raw input to final export format, executing each stage in sequence while maintaining data integrity and logging.

Attributes:

    load_module (Optional[DataLoader]): Optional data loading component for ingesting external data
    process_module (Optional[DataProcessor]): Optional processing component for filtering and transforming data
    annotation_module (Optional[DataAnnotator]): Optional annotation component for adding labels and metadata
    export_module (Optional[DataExporter]): Optional export component for outputting data in target formats
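Example:

A minimal sketch (not taken from the library's documentation) of wiring a DataBuilder directly from pre-built stage modules and running it. Constructing the individual DataLoader and DataExporter instances is not covered on this page, so my_loader and my_exporter are assumed to be already-configured instances, and the config/metadata keys are illustrative only.

from rm_gallery.core.data.build import DataBuilder

# my_loader / my_exporter: hypothetical, pre-configured DataLoader / DataExporter instances
builder = DataBuilder(
    name="demo_pipeline",
    config={"max_samples": 100},        # assumed pipeline-level option
    metadata={"owner": "docs-example"},
    load_module=my_loader,
    export_module=my_exporter,
)

# Processing and annotation modules are not set, so those stages are skipped.
dataset = builder.run()                 # None input is fine when a loader is configured
print(f"{len(dataset)} items produced")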

Source code in rm_gallery/core/data/build.py
class DataBuilder(BaseDataModule):
    """
    Main pipeline orchestrator that coordinates all data processing stages.

    Manages the complete data workflow from raw input to final export format,
    executing each stage in sequence while maintaining data integrity and logging.

    Attributes:
        load_module: Optional data loading component for ingesting external data
        process_module: Optional processing component for filtering and transforming data
        annotation_module: Optional annotation component for adding labels and metadata
        export_module: Optional export component for outputting data in target formats
    """

    load_module: Optional[DataLoader] = Field(default=None)
    process_module: Optional[DataProcessor] = Field(default=None)
    annotation_module: Optional[DataAnnotator] = Field(default=None)
    export_module: Optional[DataExporter] = Field(default=None)

    def __init__(
        self,
        name: str,
        config: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **modules,
    ):
        """
        Initialize the data build pipeline with specified modules.

        Args:
            name: Unique identifier for the pipeline instance
            config: Pipeline-level configuration parameters
            metadata: Additional metadata for tracking and debugging
            **modules: Keyword arguments for individual pipeline modules
        """
        super().__init__(
            module_type=DataModuleType.BUILD,
            name=name,
            config=config,
            metadata=metadata,
            **modules,
        )

    def run(
        self, input_data: Union[BaseDataSet, List[DataSample], None] = None, **kwargs
    ) -> BaseDataSet:
        """
        Execute the complete data processing pipeline with all configured stages.

        Processes data through sequential stages: loading → processing → annotation → export.
        Each stage is optional and only executed if the corresponding module is configured.

        Args:
            input_data: Initial dataset, list of samples, or None for load-only pipelines
            **kwargs: Additional runtime parameters passed to individual modules

        Returns:
            Final processed dataset after all stages complete

        Raises:
            Exception: If any pipeline stage fails, with detailed error logging
        """
        try:
            current_data = input_data
            logger.info(f"Starting data build pipeline: {self.name}")

            # Define pipeline stages with human-readable names
            stages = [
                ("Loading", self.load_module),
                ("Processing", self.process_module),
                ("Annotation", self.annotation_module),
                ("Export", self.export_module),
            ]

            for stage_name, module in stages:
                if module:
                    logger.info(f"Stage: {stage_name}")
                    current_data = module.run(current_data)
                    logger.info(f"{stage_name} completed: {len(current_data)} items")

            logger.info(f"Pipeline completed: {len(current_data)} items processed")
            return current_data

        except Exception as e:
            logger.error(f"Pipeline execution failed: {str(e)}")
            raise e

__init__(name, config=None, metadata=None, **modules)

Initialize the data build pipeline with specified modules.

Parameters:

    name (str): Unique identifier for the pipeline instance. Required.
    config (Optional[Dict[str, Any]]): Pipeline-level configuration parameters. Default: None.
    metadata (Optional[Dict[str, Any]]): Additional metadata for tracking and debugging. Default: None.
    **modules: Keyword arguments for individual pipeline modules. Default: {}.
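Example:

A short construction sketch, assuming the **modules keyword names are the field names declared on the class (load_module, process_module, annotation_module, export_module); my_annotator stands in for a pre-configured DataAnnotator instance.

from rm_gallery.core.data.build import DataBuilder

builder = DataBuilder(
    name="annotation_only",
    config={"seed": 42},                 # illustrative config key
    metadata={"created_by": "example"},
    annotation_module=my_annotator,      # hypothetical DataAnnotator instance
)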
Source code in rm_gallery/core/data/build.py
def __init__(
    self,
    name: str,
    config: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **modules,
):
    """
    Initialize the data build pipeline with specified modules.

    Args:
        name: Unique identifier for the pipeline instance
        config: Pipeline-level configuration parameters
        metadata: Additional metadata for tracking and debugging
        **modules: Keyword arguments for individual pipeline modules
    """
    super().__init__(
        module_type=DataModuleType.BUILD,
        name=name,
        config=config,
        metadata=metadata,
        **modules,
    )

run(input_data=None, **kwargs)

Execute the complete data processing pipeline with all configured stages.

Processes data through sequential stages: loading → processing → annotation → export. Each stage is optional and only executed if the corresponding module is configured.

Parameters:

    input_data (Union[BaseDataSet, List[DataSample], None]): Initial dataset, list of samples, or None for load-only pipelines. Default: None.
    **kwargs: Additional runtime parameters passed to individual modules. Default: {}.

Returns:

    BaseDataSet: Final processed dataset after all stages complete

Raises:

    Exception: If any pipeline stage fails, with detailed error logging
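Example:

The three accepted input shapes follow directly from the signature; samples and dataset below are assumed to exist already, since constructing DataSample or BaseDataSet objects is not covered on this page. Note that in the source shown below, **kwargs is accepted but not forwarded to the individual module.run() calls.

result = builder.run()           # None: the configured loader supplies the data
result = builder.run(samples)    # a plain List[DataSample]
result = builder.run(dataset)    # an existing BaseDataSet
print(f"{len(result)} items after the pipeline")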

Source code in rm_gallery/core/data/build.py
def run(
    self, input_data: Union[BaseDataSet, List[DataSample], None] = None, **kwargs
) -> BaseDataSet:
    """
    Execute the complete data processing pipeline with all configured stages.

    Processes data through sequential stages: loading → processing → annotation → export.
    Each stage is optional and only executed if the corresponding module is configured.

    Args:
        input_data: Initial dataset, list of samples, or None for load-only pipelines
        **kwargs: Additional runtime parameters passed to individual modules

    Returns:
        Final processed dataset after all stages complete

    Raises:
        Exception: If any pipeline stage fails, with detailed error logging
    """
    try:
        current_data = input_data
        logger.info(f"Starting data build pipeline: {self.name}")

        # Define pipeline stages with human-readable names
        stages = [
            ("Loading", self.load_module),
            ("Processing", self.process_module),
            ("Annotation", self.annotation_module),
            ("Export", self.export_module),
        ]

        for stage_name, module in stages:
            if module:
                logger.info(f"Stage: {stage_name}")
                current_data = module.run(current_data)
                logger.info(f"{stage_name} completed: {len(current_data)} items")

        logger.info(f"Pipeline completed: {len(current_data)} items processed")
        return current_data

    except Exception as e:
        logger.error(f"Pipeline execution failed: {str(e)}")
        raise e

create_builder(name, config=None, **modules)

Factory function to create a data build module with specified configuration.

Parameters:

    name (str): Unique identifier for the pipeline. Required.
    config (Optional[Dict[str, Any]]): Pipeline configuration parameters. Default: None.
    **modules: Individual module instances to include in the pipeline. Default: {}.

Returns:

    DataBuilder: Configured DataBuilder instance ready for execution
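Example:

Usage sketch: the factory simply forwards its arguments to the DataBuilder constructor; note that, unlike the constructor, it does not expose a metadata parameter. my_loader and my_exporter are hypothetical, pre-configured module instances.

from rm_gallery.core.data.build import create_builder

builder = create_builder(
    "export_pipeline",
    config={"batch_size": 64},    # illustrative config key
    load_module=my_loader,
    export_module=my_exporter,
)
dataset = builder.run()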

Source code in rm_gallery/core/data/build.py
def create_builder(
    name: str, config: Optional[Dict[str, Any]] = None, **modules
) -> DataBuilder:
    """
    Factory function to create a data build module with specified configuration.

    Args:
        name: Unique identifier for the pipeline
        config: Pipeline configuration parameters
        **modules: Individual module instances to include in the pipeline

    Returns:
        Configured DataBuilder instance ready for execution
    """
    return DataBuilder(name=name, config=config, **modules)

create_builder_from_yaml(config_path)

Create a data build module from YAML configuration file.

Supports comprehensive pipeline configuration including data sources, processing operators, annotation settings, and export formats.

Parameters:

    config_path (str): Path to YAML configuration file. Required.

Returns:

    DataBuilder: Fully configured DataBuilder instance based on YAML specification

Raises:

    FileNotFoundError: If configuration file does not exist
    ValueError: If configuration format is invalid
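Example:

Usage sketch: the path is hypothetical, and only the requirement of a top-level dataset key is guaranteed by the source below; the nested structure is handled by _create_from_dataset_config, which is not documented on this page.

from rm_gallery.core.data.build import create_builder_from_yaml

# The YAML file must contain a top-level `dataset:` section describing the
# data source, processing operators, annotation and export settings.
builder = create_builder_from_yaml("configs/my_pipeline.yaml")  # hypothetical path
dataset = builder.run()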

Source code in rm_gallery/core/data/build.py
def create_builder_from_yaml(config_path: str) -> DataBuilder:
    """
    Create a data build module from YAML configuration file.

    Supports comprehensive pipeline configuration including data sources,
    processing operators, annotation settings, and export formats.

    Args:
        config_path: Path to YAML configuration file

    Returns:
        Fully configured DataBuilder instance based on YAML specification

    Raises:
        FileNotFoundError: If configuration file does not exist
        ValueError: If configuration format is invalid
    """
    config_path = Path(config_path)
    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    config = read_yaml(config_path)

    # Support new dataset structure
    if "dataset" in config:
        return _create_from_dataset_config(config["dataset"])
    else:
        raise ValueError("Invalid configuration file")