Source code for data_juicer.ops.mapper.mllm_mapper

from data_juicer.ops.base_op import OPERATORS, Mapper
from data_juicer.ops.op_fusion import LOADED_IMAGES
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_image
from data_juicer.utils.model_utils import get_model, prepare_model

OP_NAME = 'mllm_mapper'

torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')
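# limit torch's intra-op CPU parallelism to a single thread per process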
torch.set_num_threads(1)


@LOADED_IMAGES.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class MllmMapper(Mapper):
    """Mapper to use MLLMs for visual question answering tasks.

    Recommended model list: [
        llava-hf/llava-v1.6-vicuna-7b-hf,
        Qwen/Qwen2-VL-7B-Instruct,
    ]
    """

    _accelerator = 'cuda'
    def __init__(self,
                 hf_model: str = 'llava-hf/llava-v1.6-vicuna-7b-hf',
                 max_new_tokens=256,
                 temperature=0.2,
                 top_p=None,
                 num_beams=1,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param hf_model: huggingface model id.
        :param max_new_tokens: the maximum number of new tokens
            generated by the model.
        :param temperature: used to control the randomness of generated
            text. The higher the temperature, the more random and
            creative the generated text will be.
        :param top_p: randomly select the next word from the group of
            words whose cumulative probability reaches p.
        :param num_beams: the larger the beam search size, the higher
            the quality of the generated text.
        :param args: extra args
        :param kwargs: extra args
        """
        kwargs.setdefault('mem_required', '32GB')
        kwargs.setdefault('num_proc', 1)
        super().__init__(*args, **kwargs)

        self.hf_model = hf_model
        self.model_key = prepare_model(
            model_type='huggingface',
            pretrained_model_name_or_path=hf_model)
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.num_beams = num_beams
    def process_single(self, sample=None, rank=None):
        # there is no image in this sample
        if self.image_key not in sample or not sample[self.image_key]:
            return sample

        # load images, avoiding loading the same image twice
        loaded_image_keys = sample[self.image_key]
        images = {}
        for loaded_image_key in loaded_image_keys:
            if loaded_image_key not in images:
                image = load_image(loaded_image_key)
                images[loaded_image_key] = image

        model, processor = get_model(model_key=self.model_key,
                                     rank=rank,
                                     use_cuda=self.use_cuda())

        # build a single-turn conversation: the sample text is the question
        # and the image placeholder is filled per image during processing
        conversation = [
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'text',
                        'text': sample[self.text_key]
                    },
                    {
                        'type': 'image'
                    },
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation,
                                               add_generation_prompt=True)

        # the generated answers replace the original text, one per image
        sample[self.text_key] = []
        for image_key in images:
            inputs = processor(images=images[image_key],
                               text=prompt,
                               return_tensors='pt').to(model.device)

            response = model.generate(**inputs,
                                      max_new_tokens=self.max_new_tokens,
                                      temperature=self.temperature,
                                      top_p=self.top_p,
                                      num_beams=self.num_beams)
            output = processor.decode(response.cpu()[0],
                                      skip_special_tokens=True)
            sample[self.text_key].append(output)

        return sample
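
A minimal usage sketch (illustrative, not part of the module): it assumes
data-juicer's default sample schema, where the question text is stored under
the 'text' key and image paths under the 'images' key. The image path below
is hypothetical, and instantiating the op downloads the Hugging Face
checkpoint.

if __name__ == '__main__':
    op = MllmMapper(hf_model='llava-hf/llava-v1.6-vicuna-7b-hf')
    sample = {
        'text': 'What objects are visible in this image?',  # the VQA prompt
        'images': ['path/to/local/image.jpg'],  # hypothetical image path
    }
    result = op.process_single(sample)
    # result['text'] is now a list with one generated answer per distinct image
    print(result['text'])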