
Issue generating audio with Qwen3-Omni #43074

@Hert4

Description

System Info

import os
os.environ['VLLM_USE_V1'] = '1'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# os.environ['VLLM_ATTENTION_BACKEND'] = 'XFORMERS'  

import torch
import warnings
import numpy as np

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

from qwen_omni_utils import process_mm_info
from transformers import Qwen3OmniMoeProcessor


MODEL_PATH = "output/qwen3-omni-audio/merged"
USE_TRANSFORMERS = True 
TRANSFORMERS_USE_FLASH_ATTN2 = False
USE_AUDIO_IN_VIDEO = False
RETURN_AUDIO = True

def _load_model_processor():
    if USE_TRANSFORMERS:
        from transformers import Qwen3OmniMoeForConditionalGeneration
        if TRANSFORMERS_USE_FLASH_ATTN2:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH,
                dtype='auto',
                attn_implementation='flash_attention_2',
                device_map="auto"
            )
            model.enable_talker()
        else:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH, 
                device_map="auto", 
                dtype='auto'
            )
            model.enable_talker()
    else:
        from vllm import LLM
        model = LLM(
            model=MODEL_PATH, 
            trust_remote_code=True, 
            gpu_memory_utilization=0.95,
            tensor_parallel_size=torch.cuda.device_count(),
            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},
            max_num_seqs=1,
            max_model_len=32768,
            seed=1234,
        )

    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
    return model, processor

def run_model(model, processor, messages, return_audio, use_audio_in_video):
    if USE_TRANSFORMERS:
        text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = processor(text=text, audio=audios, images=images, videos=videos, 
                          return_tensors="pt", padding=True, use_audio_in_video=use_audio_in_video)
        inputs = inputs.to(model.device).to(model.dtype)
        text_ids, audio = model.generate(
            **inputs, 
            thinker_return_dict_in_generate=True,
            thinker_max_new_tokens=8192, 
            thinker_do_sample=False,
            speaker="Ethan", 
            use_audio_in_video=use_audio_in_video,
            return_audio=return_audio
        )
        response = processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1]:], 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=False
        )[0]
        if audio is not None:
            # Convert the float waveform in [-1, 1] to 16-bit PCM for wavfile.write
            audio = (audio.reshape(-1).detach().cpu().numpy() * 32767).astype(np.int16)
        return response, audio
    else:
        from vllm import SamplingParams
        sampling_params = SamplingParams(temperature=1e-2, top_p=0.1, top_k=1, max_tokens=8192)
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = {
            'prompt': text, 
            'multi_modal_data': {}, 
            "mm_processor_kwargs": {"use_audio_in_video": use_audio_in_video}
        }
        if images is not None: inputs['multi_modal_data']['image'] = images
        if videos is not None: inputs['multi_modal_data']['video'] = videos
        if audios is not None: inputs['multi_modal_data']['audio'] = audios
        outputs = model.generate(inputs, sampling_params=sampling_params)
        response = outputs[0].outputs[0].text
        return response, None


import scipy.io.wavfile as wavfile

if __name__ == '__main__':
    from multiprocessing import freeze_support
    freeze_support()
    
    # Load model
    model, processor = _load_model_processor()
    
    # Test message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Say hello?"}
            ]
        }
    ]
    
    # Run inference
    response, audio = run_model(
        model=model, 
        messages=messages, 
        processor=processor, 
        return_audio=RETURN_AUDIO, 
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    )
    
    print("Response:", response)
    
    if audio is not None:
        output_path = "output_audio.wav"
        sample_rate = 24000  # 24 kHz output, as in the Qwen-Omni example scripts
        wavfile.write(output_path, sample_rate, audio)
        print(f"Saved audio to: {output_path}")
    else:
        print("No audio generate.")
Error:

ValueError: Cannot use talker when talker module not initialized. Use `enable_talker` method or set enable_talker in config to enable talker.

packages:

transformers>=4.57.0
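
For reference, the error message offers two remedies: calling `enable_talker()` (tried below) or enabling the talker in the config before the model is built. A minimal sketch of the config route follows; the attribute name is taken straight from the error text and its exact location in the Qwen3-Omni config has not been verified:

from transformers import AutoConfig, Qwen3OmniMoeForConditionalGeneration

MODEL_PATH = "output/qwen3-omni-audio/merged"

# Flip the flag the ValueError refers to before loading the weights.
# NOTE: placing enable_talker at the top level of the config is an assumption.
config = AutoConfig.from_pretrained(MODEL_PATH)
config.enable_talker = True

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    config=config,
    dtype="auto",
    device_map="auto",
)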

Calling model.enable_talker() after loading leads to another error:

AttributeError: 'Qwen3OmniMoeTalkerTextConfig' object has no attribute 'shared_expert_intermediate_size'
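
That AttributeError suggests the talker text sub-config in the merged export may be missing fields the talker expects. As a quick diagnostic, one could check which talker-related settings and weights actually made it into the merged directory. The sketch below assumes a standard Hugging Face layout (config.json plus a sharded safetensors index); the "talker_config" key name and the "talker." weight prefix are assumptions:

import json, os

MODEL_PATH = "output/qwen3-omni-audio/merged"

# 1) Does the saved config still contain the talker sub-config and the field
#    the AttributeError complains about?
with open(os.path.join(MODEL_PATH, "config.json")) as f:
    cfg = json.load(f)
talker_cfg = cfg.get("talker_config", {})
print("talker_config keys:", sorted(talker_cfg))
# Crude substring check so nested sub-configs are covered too.
print("shared_expert_intermediate_size present:",
      "shared_expert_intermediate_size" in json.dumps(talker_cfg))

# 2) Did any talker weights survive the merge? Assumes a sharded safetensors
#    checkpoint with the usual index file.
index_path = os.path.join(MODEL_PATH, "model.safetensors.index.json")
if os.path.exists(index_path):
    with open(index_path) as f:
        weight_map = json.load(f)["weight_map"]
    talker_keys = [k for k in weight_map if k.startswith("talker.")]
    print("talker weight tensors:", len(talker_keys))
else:
    print("No model.safetensors.index.json found (single-file checkpoint?)")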

Who can help?

No response

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

Run the Python code above.

Expected behavior

...
