@dataclass
class Schema:
    """Dataset schema representation.

    Attributes:
        column_types: Mapping of column names to their types
        columns: List of column names in order

    Raises:
        ValueError: On construction, if any entry of ``columns`` has no
            corresponding key in ``column_types``.
    """

    column_types: Dict[str, Any]
    columns: List[str]

    @classmethod
    def map_hf_type_to_python(cls, feature):
        """Map HuggingFace feature type to Python type.

        Only the outermost kind of the feature is reported: containers map
        to the bare ``list`` / ``dict`` types and their element types are
        not inspected.

        Examples:
            Value('string') -> str
            Sequence(Value('int32')) -> list
            {'text': Value('string')} -> dict
            ClassLabel(...) -> int

        Args:
            feature: HuggingFace feature type

        Returns:
            Corresponding Python type, or ``Any`` when the feature (or the
            dtype of a ``Value``) is not recognised.
        """
        # Dictionary features. Tested first because this check does not
        # depend on any HuggingFace class. The class *name* is compared:
        # str(type(x)) renders as "<class '...Dict'>" and therefore can
        # never end with 'Dict'.
        if isinstance(feature, dict) or type(feature).__name__.endswith('Dict'):
            return dict

        # Scalar values: translate the dtype string.
        if isinstance(feature, Value):
            type_mapping = {
                'string': str,
                'int32': int,
                'int64': int,
                'float32': float,
                'float64': float,
                'bool': bool,
                'binary': bytes,
            }
            return type_mapping.get(feature.dtype, Any)

        # Sequences and fixed-rank arrays are all reported as lists.
        if isinstance(feature, (Sequence, Array2D, Array3D)):
            return list

        # Class labels are reported as int.
        if isinstance(feature, ClassLabel):
            return int

        # Default to Any for unknown types.
        return Any

    @classmethod
    def map_ray_type_to_python(cls, ray_type: 'pa.DataType') -> type:
        """Map Ray/Arrow data type to Python type.

        The annotation is a string forward reference so that it is not
        evaluated when the class body runs.

        Args:
            ray_type: PyArrow DataType

        Returns:
            Corresponding Python type, or ``Any`` when none of the checks
            below matches.
        """
        # String types
        if pa.types.is_string(ray_type):
            return str
        if pa.types.is_binary(ray_type):
            return bytes

        # Numeric types
        if pa.types.is_integer(ray_type):
            return int
        if pa.types.is_floating(ray_type):
            return float

        # Boolean
        if pa.types.is_boolean(ray_type):
            return bool

        # List/Array types
        if pa.types.is_list(ray_type):
            return list

        # Dictionary/Struct types
        if pa.types.is_struct(ray_type) or pa.types.is_map(ray_type):
            return dict

        # Fallback
        return Any

    def __post_init__(self):
        """Validate schema after initialization.

        Raises:
            ValueError: If a name in ``columns`` is absent from
                ``column_types``; the message lists the missing names.
        """
        # One set difference both detects and reports the missing columns.
        missing = set(self.columns) - set(self.column_types.keys())
        if missing:
            raise ValueError(f'Missing type definitions for columns: {missing}')

    def __str__(self) -> str:
        """Return formatted string representation of schema.

        The output is a title line, a 40-character dashed rule, then one
        ``name: type`` line per column in ``columns`` order.
        """
        lines = ['Dataset Schema:', '-' * 40]
        lines.extend(f'{col}: {self.column_types[col]}' for col in self.columns)
        return '\n'.join(lines)