Paste P76838

simple ovms+kserve model-server
Active · Public

Authored by kevinbazira on Jun 2 2025, 2:46 PM.
import os

import kserve
import numpy as np
from kserve.errors import InvalidInput
from openvino.runtime import Core
from transformers import AutoTokenizer

MODEL_BASE = "/mnt/models/phi-4-mini-instruct-int8-ov/1" # version folder
IR_XML = os.path.join(MODEL_BASE, "openvino_model.xml")
IR_BIN = os.path.join(MODEL_BASE, "openvino_model.bin")
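
# Assumed layout under MODEL_BASE (the HF tokenizer files must sit next to
# the IR so AutoTokenizer.from_pretrained(MODEL_BASE) can find them):
#   /mnt/models/phi-4-mini-instruct-int8-ov/1/
#       openvino_model.xml          # IR graph definition
#       openvino_model.bin          # IR weights
#       tokenizer.json, tokenizer_config.json, ...   # HuggingFace tokenizer
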
class SimpleOVMSModel(kserve.Model):
    def __init__(self, name: str):
        super().__init__(name)
        self.name = name
        self.tokenizer = None
        self.core = None
        self.compiled = None
        self.input_ids_name = None
        self.attn_name = None
        self.pos_name = None
        self.beam_name = None
        self.logits_name = None

    def load(self) -> bool:
        """
        1. Load the HuggingFace tokenizer from MODEL_BASE
        2. Read and compile the IR with OpenVINO
        3. Cache the input/output layer names
        """
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE, trust_remote_code=False)
        self.core = Core()
        model = self.core.read_model(model=IR_XML, weights=IR_BIN)
        self.compiled = self.core.compile_model(model, device_name="CPU")
        # Cache layer names (assumes the standard input order:
        # input_ids, attention_mask, position_ids, beam_idx)
        inputs = list(self.compiled.inputs)
        self.input_ids_name = inputs[0].get_any_name()
        self.attn_name = inputs[1].get_any_name()
        self.pos_name = inputs[2].get_any_name()
        self.beam_name = inputs[3].get_any_name()
        self.logits_name = self.compiled.outputs[0].get_any_name()
        self.ready = True
        return True

    def preprocess(self, payload: dict, headers: dict = None) -> dict:
        """
        Expect:
            { "instances": [ { "text": "some sentence" } ] }
        Tokenize to NumPy arrays for input_ids, attention_mask,
        position_ids, and beam_idx, returning a dict of Python lists
        so the result stays JSON-serializable.
        """
        instances = payload.get("instances", [])
        if not instances or "text" not in instances[0]:
            raise InvalidInput("Missing 'text' field in instances")
        text = instances[0]["text"]
        enc = self.tokenizer(text, return_tensors="np", padding=True, truncation=True)
        input_ids = enc["input_ids"].astype(np.int64)            # shape (1, seq_len)
        attention_mask = enc["attention_mask"].astype(np.int64)  # shape (1, seq_len)
        batch_size, seq_len = input_ids.shape
        position_ids = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
        beam_idx = np.zeros((batch_size,), dtype=np.int32)
        return {
            "input_ids": input_ids.tolist(),
            "attention_mask": attention_mask.tolist(),
            "position_ids": position_ids.tolist(),
            "beam_idx": beam_idx.tolist(),
        }

    def predict(self, processed_inputs: dict, headers: dict = None) -> dict:
        """
        1. Convert the lists back to NumPy arrays
        2. Call OpenVINO .infer_new_request(...)
        3. Greedy argmax over the logits → token IDs
        4. Detokenize the IDs → text
        5. Return {"predictions": [{"text": generated_text}]}
        """
        # Reconstruct NumPy arrays
        input_ids = np.array(processed_inputs["input_ids"], dtype=np.int64)
        attention_mask = np.array(processed_inputs["attention_mask"], dtype=np.int64)
        position_ids = np.array(processed_inputs["position_ids"], dtype=np.int64)
        beam_idx = np.array(processed_inputs["beam_idx"], dtype=np.int32)
        infer_inputs = {
            self.input_ids_name: input_ids,
            self.attn_name: attention_mask,
            self.pos_name: position_ids,
            self.beam_name: beam_idx,
        }
        results = self.compiled.infer_new_request(infer_inputs)
        logits = results[self.logits_name]  # shape: (1, seq_len, vocab_size)
        # Greedy decode: pick the argmax token at each step
        # (assumes this IR outputs a full sequence of new tokens)
        token_ids = np.argmax(logits, axis=-1).flatten().tolist()
        # Detokenize back to text
        generated_text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
        return {"predictions": [{"text": generated_text}]}
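
    # Note (assumption about this IR): LLMs exported with a beam_idx input
    # (e.g. via optimum-intel) are usually stateful, i.e. the KV cache lives
    # inside the InferRequest. infer_new_request() builds a fresh request per
    # call, so every prediction starts from an empty cache; true autoregressive
    # generation would instead loop over a single reused request.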

if __name__ == "__main__":
    model_name = os.environ.get("MODEL_NAME", "simple-ovms")
    model = SimpleOVMSModel(model_name)
    model.load()
    kserve.ModelServer().start([model])
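
# Example request against the running server (assumes KServe's default HTTP
# port 8080 and the default model name "simple-ovms"; adjust as needed):
#
#   curl -s -X POST http://localhost:8080/v1/models/simple-ovms:predict \
#        -H "Content-Type: application/json" \
#        -d '{"instances": [{"text": "Hello"}]}'
#
# Expected response shape: {"predictions": [{"text": "..."}]}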