Source code for trinity.common.models.mm_utils

""""Multi-modal utilities for processing and handling multi-modal data such as images and videos.
Only support Qwen2.5 VL series.

Modified from: verl/utils/dataset/rl_dataset.py
"""
import re
from typing import Any, Dict, List

import numpy as np
from PIL import Image


[docs]
def build_multi_modal_inputs(
    prompt: str,
    images: List[Image.Image],
    videos: List[np.ndarray],
    processor: Any,
) -> Dict[str, Any]:
    """Preprocess multi-modal data and build multi-modal inputs."""
    if prompt is None:
        raise ValueError("Prompt is required for building multi-modal inputs")
    multi_modal_data = {}
    if images:
        multi_modal_data["image"] = images
    if videos:
        multi_modal_data["video"] = videos
    model_inputs = processor(
        text=[prompt],
        images=multi_modal_data.get("image", None),
        videos=multi_modal_data.get("video", None),
        return_tensors="pt",
    )
    # Keep the token ids of the single prompt; the attention mask is dropped here
    input_ids = model_inputs.pop("input_ids")[0]
    model_inputs.pop("attention_mask")
    if "second_per_grid_ts" in model_inputs:
        model_inputs.pop("second_per_grid_ts")
    return {
        "prompt": prompt,
        "prompt_token_ids": input_ids,
        "multi_modal_data": multi_modal_data,
        "multi_modal_inputs": dict(model_inputs),
    }
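
A minimal usage sketch, assuming a Qwen2.5-VL processor loaded via transformers.AutoProcessor; the checkpoint name, image path, and raw prompt string below are illustrative assumptions, not taken from this module:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
image = Image.open("example.jpg").convert("RGB")  # hypothetical local file
# The templated prompt carries vision placeholder tokens where the image goes.
prompt = (
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "Describe this image.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = build_multi_modal_inputs(
    prompt=prompt, images=[image], videos=[], processor=processor
)
print(inputs["prompt_token_ids"].shape)      # 1-D tensor of prompt token ids
print(inputs["multi_modal_inputs"].keys())   # e.g. pixel_values, image_grid_thw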
[docs]
def convert_messages_to_mm_format(messages: List[Dict]) -> List[Dict]:
    """Split message text on <image>/<video> placeholders into a structured content list."""
    for message in messages:
        content = message["content"]
        content_list = []
        # Split on the placeholders while keeping them as their own segments,
        # then drop the empty strings re.split may produce at the boundaries
        segments = re.split("(<image>|<video>)", content)
        segments = [item for item in segments if item != ""]
        for segment in segments:
            if segment == "<image>":
                content_list.append(
                    {"type": "image"}
                )  # chat template will fill the actual image data later
            elif segment == "<video>":
                content_list.append(
                    {"type": "video"}
                )  # chat template will fill the actual video data later
            else:
                content_list.append({"type": "text", "text": segment})
        message["content"] = content_list
    return messages
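
For illustration, a sketch of how the conversion restructures a message (the sample message is invented); the resulting content list is the structured form that a chat template can later combine with the actual image or video data:

messages = [{"role": "user", "content": "<image>What animal is shown here?"}]
converted = convert_messages_to_mm_format(messages)
# converted[0]["content"] is now:
# [{"type": "image"},
#  {"type": "text", "text": "What animal is shown here?"}]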