请确保 transformers==4.44.2,其他版本目前可能会有兼容性问题,我们正在解决。
如果你使用的是低版本的 PyTorch,你可能会遇到这个错误"weight_norm_fwd_first_dim_kernel" not implemented for 'BFloat16', 请在模型初始化的时候添加 self.minicpmo_model.tts.float()
启动web server:
shell
# Web demo setup: install Node/PNPM, create a self-signed HTTPS certificate,
# then install dependencies and start the dev server.

# Make sure Node and PNPM is installed.
sudo apt-get update
sudo apt-get install nodejs npm
npm install -g pnpm


cd web_demos/minicpm-o_2.6/web_server
# Create a self-signed certificate for https; https is required to request
# browser camera and microphone permissions.
bash ./make_ssl_cert.sh  # output key.pem and cert.pem

pnpm install  # install requirements
pnpm run dev  # start server
# Example: single-image, multi-turn chat with MiniCPM-o 2.6.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

torch.manual_seed(100)

model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16)  # sdpa or flash_attention_2, no eager
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

image = Image.open('./assets/minicpmo2_6/show_demo.jpg').convert('RGB')

# First round chat
question = "What is the landform in the picture?"
msgs = [{'role': 'user', 'content': [image, question]}]

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

# Second round chat, pass history context of multi-turn conversation
msgs.append({"role": "assistant", "content": [answer]})
msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)
你可以得到如下推理结果:
"The landform in the picture is a mountain range. The mountains appear to be karst formations, characterized by their steep, rugged peaks and smooth, rounded shapes. These types of mountains are often found in regions with limestone bedrock and are shaped by processes such as erosion and weathering. The reflection of the mountains in the water adds to the scenic beauty of the landscape."
"When traveling to this scenic location, it's important to pay attention to the weather conditions, as the area appears to be prone to fog and mist, especially during sunrise or sunset. Additionally, ensure you have proper footwear for navigating the potentially slippery terrain around the water. Lastly, respect the natural environment by not disturbing the local flora and fauna."
多图对话
点击查看 MiniCPM-o 2.6 多图输入的 Python 代码。
python
# Example: multi-image comparison chat with MiniCPM-o 2.6.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16)  # sdpa or flash_attention_2, no eager
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

image1 = Image.open('image1.jpg').convert('RGB')
image2 = Image.open('image2.jpg').convert('RGB')
question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'

msgs = [{'role': 'user', 'content': [image1, image2, question]}]

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)
# Example: video chat with MiniCPM-o 2.6 — sample frames with decord, then chat.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu  # pip install decord

model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16)  # sdpa or flash_attention_2, no eager
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

MAX_NUM_FRAMES = 64  # if cuda OOM set a smaller number


def encode_video(video_path):
    """Sample the video at ~1 FPS (capped at MAX_NUM_FRAMES) and return PIL frames."""
    def uniform_sample(l, n):
        # Pick n indices evenly spread over l (center of each stride).
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
    frame_idx = [i for i in range(0, len(vr), sample_fps)]
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print('num frames:', len(frames))
    return frames


video_path = "video_test.mp4"
frames = encode_video(video_path)
question = "Describe the video"
msgs = [
    {'role': 'user', 'content': frames + [question]},
]

# Set decode params for video
params = {}
params["use_image_id"] = False
params["max_slice_nums"] = 2  # use 1 if cuda OOM and video resolution > 448*448

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    **params
)
print(answer)
# Example: audio mimick task — the model repeats the user's speech,
# cloning both voice style and content, and saves the TTS output.
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path='output.wav',  # save the tts result to output_audio_path
)
可配置声音的语音对话
点击查看个性化配置 MiniCPM-o 2.6 对话声音的 Python 代码。
python
# Example: configure the conversation voice from a reference audio clip.
ref_audio, _ = librosa.load('./assets/voice_01.wav', sr=16000, mono=True)  # load the reference audio

# Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt.
sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}

# Audio Assistant: # With this mode, model will speak with the voice in ref_audio as a AI assistant.
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
# user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something!
# Example: general audio understanding — fill task_prompt with one of the
# prompts listed in the string below. (The triple-quoted block is a runtime
# string expression in the original example and is kept verbatim.)
'''
Audio Understanding Task Prompt:
Speech:
    ASR with ZH(same as AST en2zh): 请仔细听这段音频片段,并将其内容逐字记录。
    ASR with EN(same as AST zh2en): Please listen to the audio snippet carefully and transcribe the content.
    Speaker Analysis: Based on the speaker's content, speculate on their gender, condition, age range, and health status.
General Audio:
    Audio Caption: Summarize the main content of the audio.
    Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
'''
task_prompt = "\n"
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)

msgs = [{'role': 'user', 'content': [task_prompt, audio_input]}]

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
    output_audio_path='result.wav',
)
print(res)
python
1'''
2Speech Generation Task Prompt:
3 Human Instruction-to-Speech: see https://voxinstruct.github.io/VoxInstruct/
4 Example:
5 # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
6 # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
多模态流式交互
点击查看 MiniCPM-o 2.6 多模态流式交互的 Python 代码。
python
# Example: multimodal omni inference — split a video into 1-second units of
# (frame, audio chunk) and chat over them, optionally generating TTS audio.
import math
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip
import tempfile
import librosa
import soundfile as sf
import torch
from transformers import AutoModel, AutoTokenizer


def get_video_chunk_content(video_path, flatten=True):
    """Split the video into 1s units of ["<unit>", frame, audio-chunk].

    If flatten is True the units are concatenated into one flat list,
    otherwise each unit is kept as its own sub-list.
    """
    video = VideoFileClip(video_path)
    print('video_duration:', video.duration)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
        temp_audio_file_path = temp_audio_file.name
        video.audio.write_audiofile(temp_audio_file_path, codec="pcm_s16le", fps=16000)
        audio_np, sr = librosa.load(temp_audio_file_path, sr=16000, mono=True)
    num_units = math.ceil(video.duration)

    # 1 frame + 1s audio chunk
    contents = []
    for i in range(num_units):
        frame = video.get_frame(i + 1)
        image = Image.fromarray((frame).astype(np.uint8))
        audio = audio_np[sr * i:sr * (i + 1)]
        if flatten:
            contents.extend(["<unit>", image, audio])
        else:
            contents.append(["<unit>", image, audio])

    return contents


model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

model.init_tts()

# If you are using an older version of PyTorch, you might encounter this issue "weight_norm_fwd_first_dim_kernel" not implemented for 'BFloat16', Please convert the TTS to float32 type.
# model.tts.float()

# https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/assets/Skiing.mp4
video_path = "assets/Skiing.mp4"
sys_msg = model.get_sys_prompt(mode='omni', language='en')
# if use voice clone prompt, please set ref_audio
# ref_audio_path = '/path/to/ref_audio'
# ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
# sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')

contents = get_video_chunk_content(video_path)
msg = {"role": "user", "content": contents}
msgs = [sys_msg, msg]

# please set generate_audio=True and output_audio_path to save the tts result
generate_audio = True
output_audio_path = 'output.wav'

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.5,
    max_new_tokens=4096,
    omni_input=True,  # please set omni_input=True when omni inference
    use_tts_template=True,
    generate_audio=generate_audio,
    output_audio_path=output_audio_path,
    max_slice_nums=1,
    use_image_id=False,
    return_dict=True
)
print(res)
点击查看多模态流式推理设置。
注意:流式推理存在轻微的性能下降,因为音频编码并非全局的。
python
# Example: streaming inference — prefill the session chunk by chunk, then
# generate text (and optionally audio) incrementally.

# a new conversation need reset session first, it will reset the kv-cache
model.reset_session()

contents = get_video_chunk_content(video_path, flatten=False)
session_id = '123'
generate_audio = True

# 1. prefill system prompt
res = model.streaming_prefill(
    session_id=session_id,
    msgs=[sys_msg],
    tokenizer=tokenizer
)

# 2. prefill video/audio chunks
for content in contents:
    msgs = [{"role": "user", "content": content}]
    res = model.streaming_prefill(
        session_id=session_id,
        msgs=msgs,
        tokenizer=tokenizer
    )

# 3. generate
res = model.streaming_generate(
    session_id=session_id,
    tokenizer=tokenizer,
    temperature=0.5,
    generate_audio=generate_audio
)

audios = []
text = ""

if generate_audio:
    for r in res:
        audio_wav = r.audio_wav
        sampling_rate = r.sampling_rate
        txt = r.text

        audios.append(audio_wav)
        text += txt

    res = np.concatenate(audios)
    sf.write("output.wav", res, samplerate=sampling_rate)
    print("text:", text)
    print("audio saved to output.wav")
else:
    for r in res:
        text += r['text']
    print("text:", text)
# Example (continuation of the voice-configuration snippet): chat using the
# previously built sys_prompt and user_question, saving the TTS result.
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)

msgs = [sys_prompt, user_question]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
    output_audio_path='result.wav',
)
# Example: serving MiniCPM-V with vLLM.
# NOTE(review): the left-hand side of the first assignment was truncated in the
# extracted text; "MODEL_NAME =" is reconstructed from the commented
# alternatives directly below — confirm against the upstream README.
MODEL_NAME = "openbmb/MiniCPM-V-2_6"
# MODEL_NAME = "openbmb/MiniCPM-O-2_6"
# Also available for previous models
# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
# MODEL_NAME = "HwwwH/MiniCPM-V-2"

image = Image.open("xxx.png").convert("RGB")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
llm = LLM(
    model=MODEL_NAME,
    trust_remote_code=True,
    gpu_memory_utilization=1,
    max_model_len=2048
)

messages = [{
    "role": "user",
    "content":
    # Number of images
    "(<image>./</image>)" + \
    "\nWhat is the content of this image?"
}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
inputs
=
{
36
"prompt"
:
prompt
,
37
"multi_modal_data"
:
{
38
"image"
:
image
39
# Multi images, the number of images should be equal to that of `(<image>./</image>)`