# multimode_llm.py
import os
import platform
import sys
import time
from typing import Any

from PIL import Image
# Select the acceleration backend for this platform: MLX on Apple Silicon,
# plain transformers on Intel macOS, ONNX Runtime GenAI everywhere else.
accel_mode = None
if sys.platform == 'darwin':
    if "ARM64" in platform.uname().version:
        accel_mode = 'MLX'
else:
    accel_mode = 'ONNX'
if accel_mode == 'ONNX':
    import onnxruntime_genai as og
elif accel_mode == 'MLX':
    from phi_3_vision_mlx import generate, load
else:
    from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
# Phi-3 Vision chat template markers
user_prompt_marker      = '<|user|>\n'
image_marker            = '<|image_1|>\n'
assistant_prompt_marker = '<|assistant|>\n'
prompt_suffix           = "<|end|>\n"
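
# For reference, a fully assembled prompt for an image request (as built by the
# f-string in do_chat below) looks like:
#
#   <|user|>
#   <|image_1|>
#
#   What is shown in this image?<|end|>
#   <|assistant|>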
class MultiModeLLM:

    def __init__(self, model_repo: str, filename: str, model_dir: str,
                 device: str, inference_library: str = None,
                 verbose: bool = True) -> None:

        self.device     = device
        self.model      = None
        self.processor  = None
        self.model_path = None

        try:
            if accel_mode == 'ONNX':    # Non-macOS
                # For ONNX, we download the models at install time
                self.device     = device
                self.model_path = model_dir

                self.model            = og.Model(self.model_path)
                self.processor        = self.model.create_multimodal_processor()
                self.tokenizer_stream = self.processor.create_stream()

            elif accel_mode == 'MLX':   # macOS, Apple Silicon
                # Hardcoded in the MLX code:
                # repo = "microsoft/Phi-3-vision-128k-instruct"
                self.device     = device
                self.model_path = model_dir

                self.model, self.processor = load(model_path=model_dir, adapter_path=None)

            else:                       # macOS on Intel: no MLX, no ONNX
                # For macOS (Intel) we don't download at install time (yet); we
                # download at runtime instead.
                # TBD: Download the model in the installer and load it here. If the
                # download and load fail, fall through to download-at-runtime.
                raise RuntimeError("Model was not downloaded at install time")
        except Exception as ex:
            # A general fall-through for the case where ONNX or MLX model loading
            # failed, or where we only have non-GPU-accelerated libraries (macOS
            # on Intel) to use.
            if accel_mode == 'ONNX' or accel_mode == 'MLX':
                # We tried but failed, and we won't fall back to CPU here (we could, but won't).
                self.model      = None
                self.processor  = None
                self.model_path = None
            else:
                # For macOS we only download the model at runtime (for now - this will change)
                try:
                    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct
                    # This will download the model from the repo and cache it locally.
                    # Handy if we didn't download during install.
                    attn_implementation = "eager" if device == "cpu" else "sdpa"  # eager = manual
                    # use_flash_attention_2 = False if device == "cpu" else True
                    use_flash_attention_2 = False   # Only on Windows, only on CUDA

                    config = AutoConfig.from_pretrained(model_repo, trust_remote_code=True)
                    # config.gguf_file = ...
                    config.attn_implementation   = attn_implementation
                    config.device_map            = device
                    config.torch_dtype           = 'auto'
                    # config.temperature = 0.1 - needs do_sample=True
                    config.trust_remote_code     = True
                    config.use_flash_attention_2 = use_flash_attention_2

                    self.model = AutoModelForCausalLM.from_pretrained(model_repo,
                                                                      attn_implementation=attn_implementation,
                                                                      config=config,
                                                                      trust_remote_code=True)
                    self.processor  = AutoProcessor.from_pretrained(model_repo, trust_remote_code=True)
                    self.model_path = self.model.name_or_path

                except Exception as ex_2:
                    self.model      = None
                    self.processor  = None
                    self.model_path = None

        # get the relative path to the model file from the model itself
        # self.model_path = os.path.relpath(self.llm.model_path)
    def do_chat(self, user_prompt: str, image: Image.Image, system_prompt: str = None,
                max_tokens: int = 1024, temperature: float = 0.4,
                stream: bool = True) -> Any:
        """
        Generates a response from a chat / conversation prompt.
        params:
            user_prompt: str          The prompt to generate text from
            image: Image.Image        An optional image the prompt refers to
            system_prompt: str = None The description of the assistant
            max_tokens: int = 1024    The maximum number of tokens to generate
            temperature: float = 0.4  The temperature to use for sampling
            stream: bool = True       Whether to stream the response (ONNX only)
        """
        start_process_time = time.perf_counter()
        stop_reason = None

        if not system_prompt:
            system_prompt = "You're a helpful assistant who answers questions the user asks of you concisely and accurately."

        # NOTE: system_prompt is currently not inserted into the assembled prompt
        if image:
            prompt = f"{user_prompt_marker}{image_marker}\n{user_prompt}{prompt_suffix}{assistant_prompt_marker}"
        else:
            prompt = f"{user_prompt_marker}\n{user_prompt}{prompt_suffix}{assistant_prompt_marker}"

        inferenceMs = 0
        try:
            if accel_mode == 'ONNX':

                # The ONNX genai API doesn't (yet) provide a means to load an image
                # from memory: https://github.com/microsoft/onnxruntime-genai/issues/777
                if image:
                    temp_name = "onnx_genai_temp_image.png"
                    image.save(temp_name, icc_profile=None)
                    og_image = og.Images.open(temp_name)
                    os.remove(temp_name)
                else:
                    og_image = None

                inputs = self.processor(prompt, images=og_image)

                params = og.GeneratorParams(self.model)
                params.set_inputs(inputs)
                params.set_search_options(max_length=3072)

                response  = ""
                generator = og.Generator(self.model, params)

                # If we're streaming then short-circuit here and just return the
                # generator. NOTE: the caller will need to del the generator.
                if stream:
                    return (generator, self.tokenizer_stream, {
                        "success": True,
                        "reply": response,
                        "stop_reason": "None",
                        "processMs":   int((time.perf_counter() - start_process_time) * 1000),
                        "inferenceMs": 0
                    })

                start_inference_time = time.perf_counter()

                while not generator.is_done():
                    generator.compute_logits()
                    generator.generate_next_token()

                    new_token    = generator.get_next_tokens()[0]
                    new_response = self.tokenizer_stream.decode(new_token)
                    response += new_response

                inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)

                del generator
            elif accel_mode == 'MLX':

                start_inference_time = time.perf_counter()

                # Using phi_3_vision_mlx v0.0.2
                # https://github.com/JosefAlbers/Phi-3-Vision-MLX/tree/v0.0.2-beta
                response = generate(self.model, self.processor, prompt, [image])

                # Using latest phi_3_vision_mlx:
                # agent = Agent()
                # response = agent(prompt, images=[image])
                # agent.end()

                inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)
            else:
                inputs = self.processor(prompt, image, return_tensors="pt").to(self.device)

                start_inference_time = time.perf_counter()
                generate_ids = self.model.generate(**inputs, max_new_tokens=max_tokens,
                                                   eos_token_id=self.processor.tokenizer.eos_token_id)
                inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)

                # Strip the prompt tokens from the output, then decode the rest
                generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
                response = self.processor.batch_decode(generate_ids, skip_special_tokens=True,
                                                       clean_up_tokenization_spaces=False)[0]
        except Exception as ex:
            # ONNX callers expect a (generator, tokenizer_stream, result) tuple;
            # everyone else just gets the result dict.
            if accel_mode == 'ONNX':
                return (None, None, {
                    "success": False,
                    "error": str(ex),
                    "stop_reason": "Exception",
                    "processMs":   int((time.perf_counter() - start_process_time) * 1000),
                    "inferenceMs": 0
                })

            return {
                "success": False,
                "error": str(ex),
                "stop_reason": "Exception",
                "processMs":   int((time.perf_counter() - start_process_time) * 1000),
                "inferenceMs": 0
            }
        # Free up GPU memory if we ran inference through transformers on CUDA
        if accel_mode != 'ONNX' and self.device == "cuda":
            try:
                import torch
                torch.cuda.empty_cache()
            except Exception:
                pass

        if stop_reason is None:
            stop_reason = "completed"

        return {
            "success": True,
            "reply": response,
            "stop_reason": stop_reason,
            "processMs":   int((time.perf_counter() - start_process_time) * 1000),
            "inferenceMs": inferenceMs
        }
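

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the module proper. The repo id comes from
# the comments above; the model directory and test image path are hypothetical
# placeholders - adjust them for your install layout.
if __name__ == "__main__":
    llm = MultiModeLLM(model_repo="microsoft/Phi-3.5-vision-instruct",
                       filename=None,
                       model_dir="./models/phi-3-vision",   # hypothetical path
                       device="cpu")

    image = Image.open("test.jpg")                          # hypothetical image

    if accel_mode == 'ONNX':
        # Streaming: do_chat returns (generator, tokenizer_stream, result) and
        # the caller drives token generation, then deletes the generator.
        generator, tokenizer_stream, result = llm.do_chat("Describe this image",
                                                          image, stream=True)
        if result["success"]:
            while not generator.is_done():
                generator.compute_logits()
                generator.generate_next_token()
                token = generator.get_next_tokens()[0]
                print(tokenizer_stream.decode(token), end="", flush=True)
            print()
            del generator
    else:
        # Non-streaming: do_chat returns a result dict directly
        result = llm.do_chat("Describe this image", image, stream=False)
        print(result["reply"] if result["success"] else result["error"])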