System Info
import os
os.environ['VLLM_USE_V1'] = '1'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# os.environ['VLLM_ATTENTION_BACKEND'] = 'XFORMERS'
import torch
import warnings
import numpy as np
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
from qwen_omni_utils import process_mm_info
from transformers import Qwen3OmniMoeProcessor
MODEL_PATH = "output/qwen3-omni-audio/merged"
USE_TRANSFORMERS = True
TRANSFORMERS_USE_FLASH_ATTN2 = False
USE_AUDIO_IN_VIDEO = False
RETURN_AUDIO = True
def _load_model_processor():
    if USE_TRANSFORMERS:
        from transformers import Qwen3OmniMoeForConditionalGeneration
        if TRANSFORMERS_USE_FLASH_ATTN2:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH,
                dtype='auto',
                attn_implementation='flash_attention_2',
                device_map="auto"
            )
            model.enable_talker()
        else:
            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                MODEL_PATH,
                device_map="auto",
                dtype='auto'
            )
            model.enable_talker()
    else:
        from vllm import LLM
        model = LLM(
            model=MODEL_PATH,
            trust_remote_code=True,
            gpu_memory_utilization=0.95,
            tensor_parallel_size=torch.cuda.device_count(),
            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},
            max_num_seqs=1,
            max_model_len=32768,
            seed=1234,
        )
    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
    return model, processor
def run_model(model, processor, messages, return_audio, use_audio_in_video):
    if USE_TRANSFORMERS:
        text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = processor(text=text, audio=audios, images=images, videos=videos,
                           return_tensors="pt", padding=True, use_audio_in_video=use_audio_in_video)
        inputs = inputs.to(model.device).to(model.dtype)
        text_ids, audio = model.generate(
            **inputs,
            thinker_return_dict_in_generate=True,
            thinker_max_new_tokens=8192,
            thinker_do_sample=False,
            speaker="Ethan",
            use_audio_in_video=use_audio_in_video,
            return_audio=return_audio
        )
        response = processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        if audio is not None:
            audio = np.array(audio.reshape(-1).detach().cpu().numpy() * 32767).astype(np.int16)
        return response, audio
    else:
        from vllm import SamplingParams
        sampling_params = SamplingParams(temperature=1e-2, top_p=0.1, top_k=1, max_tokens=8192)
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = {
            'prompt': text,
            'multi_modal_data': {},
            "mm_processor_kwargs": {"use_audio_in_video": use_audio_in_video}
        }
        if images is not None: inputs['multi_modal_data']['image'] = images
        if videos is not None: inputs['multi_modal_data']['video'] = videos
        if audios is not None: inputs['multi_modal_data']['audio'] = audios
        outputs = model.generate(inputs, sampling_params=sampling_params)
        response = outputs[0].outputs[0].text
        return response, None
import scipy.io.wavfile as wavfile
if __name__ == '__main__':
    from multiprocessing import freeze_support
    freeze_support()
    # Load model
    model, processor = _load_model_processor()
    # Test message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Say hello?"}
            ]
        }
    ]
    # Run inference
    response, audio = run_model(
        model=model,
        messages=messages,
        processor=processor,
        return_audio=RETURN_AUDIO,
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    )
    print("Response:", response)
    if audio is not None:
        output_path = "output_audio.wav"
        sample_rate = 24000
        wavfile.write(output_path, sample_rate, audio)
        print(f"Save success: {output_path}")
    else:
print("No audio generate.")ValueError: Cannot use talker when talker module not initialized. Use `enable_talker` method or set enable_talker in config to enable talker.
packages:
transformers>=4.57.0
Calling model.enable_talker() then raises a different error:

AttributeError: 'Qwen3OmniMoeTalkerTextConfig' object has no attribute 'shared_expert_intermediate_size'
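For reference, the other route the first traceback suggests (setting the flag on the config instead of calling the method) would look roughly like the sketch below. This is only an assumption based on the error wording ("set enable_talker in config"); the exact flag name on the model config has not been verified.

from transformers import AutoConfig, Qwen3OmniMoeForConditionalGeneration

MODEL_PATH = "output/qwen3-omni-audio/merged"

# Assumption: the config exposes an `enable_talker` flag; the attribute name
# is taken from the ValueError message only, not from the model's documentation.
config = AutoConfig.from_pretrained(MODEL_PATH)
config.enable_talker = True

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    config=config,
    dtype="auto",
    device_map="auto",
)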
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Run the Python code above.
Expected behavior
...