P76838: simple ovms+kserve model-server
Authored by kevinbazira on Jun 2 2025, 2:46 PM.
Tags: Machine-Learning-Team
import numpy as np
import os
import json
import kserve  # missing in the original paste; required for kserve.Model and kserve.ModelServer below
from transformers import AutoTokenizer
from openvino.runtime import Core

MODEL_BASE = "/mnt/models/phi-4-mini-instruct-int8-ov/1"  # version folder
IR_XML = os.path.join(MODEL_BASE, "openvino_model.xml")
IR_BIN = os.path.join(MODEL_BASE, "openvino_model.bin")
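
# Optional fail-fast check (not in the original paste): since the paths
# above are hard-coded, surface a missing model volume immediately rather
# than failing later inside load().
for _ir_file in (IR_XML, IR_BIN):
    if not os.path.exists(_ir_file):
        raise FileNotFoundError(f"expected OpenVINO IR file at {_ir_file}")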

class SimpleOVMSModel(kserve.Model):
    def __init__(self, name: str):
        super().__init__(name)
        self.name = name
        self.tokenizer = None
        self.core = None
        self.compiled = None
        self.input_ids_name = None
        self.attn_name = None
        self.pos_name = None
        self.beam_name = None
        self.logits_name = None
    def load(self) -> bool:
        """
        1. Load HuggingFace tokenizer from MODEL_BASE
        2. Read and compile the IR with OpenVINO
        3. Cache input/output layer names
        """
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE, trust_remote_code=False)
        self.core = Core()
        model = self.core.read_model(model=IR_XML, weights=IR_BIN)
        self.compiled = self.core.compile_model(model, device_name="CPU")
        # Cache layer names (assumes standard naming and ordering: input_ids,
        # attention_mask, position_ids, beam_idx; see _resolve_ports_by_name
        # below for a name-based alternative)
        inputs = list(self.compiled.inputs)
        self.input_ids_name = inputs[0]
        self.attn_name = inputs[1]
        self.pos_name = inputs[2]
        self.beam_name = inputs[3]
        self.logits_name = list(self.compiled.outputs)[0]
        self.ready = True
        return True
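
    # Hypothetical helper (not in the original paste): load() above relies
    # on the IR listing its inputs in a fixed order, which silently breaks
    # if the export orders them differently. Resolving ports by tensor name
    # is more robust; this sketch assumes the standard OpenVINO LLM tensor
    # names named in the comment above.
    def _resolve_ports_by_name(self):
        by_name = {
            name: port
            for port in self.compiled.inputs
            for name in port.get_names()
        }
        self.input_ids_name = by_name["input_ids"]
        self.attn_name = by_name["attention_mask"]
        self.pos_name = by_name["position_ids"]
        self.beam_name = by_name["beam_idx"]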
    def preprocess(self, payload: dict, headers: dict = None) -> dict:
        """
        Expect:
            { "instances": [ { "text": "some sentence" } ] }
        Tokenize to NumPy arrays for input_ids, attention_mask, position_ids, beam_idx.
        Return a dict of Python lists so that JSON is serializable.
        """
        instances = payload.get("instances", [])
        if not instances or "text" not in instances[0]:
            raise kserve.errors.InvalidInput("Missing 'text' field in instances")
        text = instances[0]["text"]
        enc = self.tokenizer(text, return_tensors="np", padding=True, truncation=True)
        input_ids = enc["input_ids"].astype(np.int64)            # shape (1, seq_len)
        attention_mask = enc["attention_mask"].astype(np.int64)  # shape (1, seq_len)
        batch_size, seq_len = input_ids.shape
        position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
        beam_idx = np.zeros((batch_size,), dtype=np.int32)
        return {
            "input_ids": input_ids.tolist(),
            "attention_mask": attention_mask.tolist(),
            "position_ids": position_ids.tolist(),
            "beam_idx": beam_idx.tolist()
        }
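
    # To make the returned shapes concrete, a 5-token prompt would serialize
    # roughly as follows (token IDs are made up, not real Phi-4 vocabulary IDs):
    # {
    #     "input_ids":      [[15, 302, 7, 991, 2]],  # shape (1, 5)
    #     "attention_mask": [[1, 1, 1, 1, 1]],       # shape (1, 5)
    #     "position_ids":   [[0, 1, 2, 3, 4]],       # shape (1, 5)
    #     "beam_idx":       [0]                      # shape (1,)
    # }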
    def predict(self, processed_inputs: dict, headers: dict = None) -> dict:
        """
        1. Convert the lists back to NumPy arrays
        2. Call OpenVINO .infer_new_request(...)
        3. Greedy argmax over logits → token IDs
        4. Detokenize IDs → text
        5. Return {"predictions": [{"text": generated_text}]}
        """
        # Reconstruct NumPy arrays
        input_ids = np.array(processed_inputs["input_ids"], dtype=np.int64)
        attention_mask = np.array(processed_inputs["attention_mask"], dtype=np.int64)
        position_ids = np.array(processed_inputs["position_ids"], dtype=np.int64)
        beam_idx = np.array(processed_inputs["beam_idx"], dtype=np.int32)
        infer_inputs = {
            self.input_ids_name: input_ids,
            self.attn_name: attention_mask,
            self.pos_name: position_ids,
            self.beam_name: beam_idx
        }
        results = self.compiled.infer_new_request(infer_inputs)
        logits = results[self.logits_name]  # shape: (1, seq_len, vocab_size)
        # Greedy decode: pick argmax for each new token step
        # (assumes this IR outputs the full sequence of new tokens in one pass;
        # see the greedy_generate sketch after this class)
        token_ids = np.argmax(logits, axis=-1).flatten().tolist()
        # Detokenize back to text
        generated_text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
        return {"predictions": [{"text": generated_text}]}
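
# Caveat (not in the original paste): a single forward pass yields next-token
# logits at every prompt position, so the argmax in predict() largely echoes
# the prompt rather than generating a continuation. Real greedy generation
# needs an autoregressive loop. Sketch below, assuming a stateful IR whose
# KV cache lives inside one infer request (typical for OpenVINO LLM exports
# that take a beam_idx input).
def greedy_generate(compiled, tokenizer, input_ids, attention_mask, max_new_tokens=32):
    request = compiled.create_infer_request()
    request.reset_state()  # clear any KV cache left from a previous prompt
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64).reshape(1, -1)
    beam_idx = np.zeros((1,), dtype=np.int32)
    generated = []
    for _ in range(max_new_tokens):
        request.infer({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "beam_idx": beam_idx,
        })
        logits = request.get_output_tensor(0).data  # (1, step_len, vocab_size)
        next_id = int(np.argmax(logits[0, -1, :]))  # greedy pick at last position
        if next_id == tokenizer.eos_token_id:
            break
        generated.append(next_id)
        # Next step feeds only the new token; the request retains the KV cache.
        input_ids = np.array([[next_id]], dtype=np.int64)
        attention_mask = np.concatenate(
            [attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1)
        position_ids = np.array([[attention_mask.shape[1] - 1]], dtype=np.int64)
    return tokenizer.decode(generated, skip_special_tokens=True)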

if __name__ == "__main__":
    model_name = os.environ.get("MODEL_NAME", "simple-ovms")
    model = SimpleOVMSModel(model_name)
    model.load()
    kserve.ModelServer().start([model])
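
# Client-side smoke test (run separately once the server is up; assumes
# KServe's default HTTP port 8080 and the default model name "simple-ovms",
# following the KServe V1 inference protocol):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8080/v1/models/simple-ovms:predict",
#       json={"instances": [{"text": "What is OpenVINO?"}]},
#       timeout=60,
#   )
#   print(resp.json())  # -> {"predictions": [{"text": "..."}]}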
Event Timeline
kevinbazira created this paste. Jun 2 2025, 2:46 PM (UTC+0)
kevinbazira mentioned this in T395012: Host an OpenVINO model in LiftWing. Jun 2 2025, 3:11 PM (UTC+0)