Merge pull request intel#195 from TNG/feature/openvino
Add OpenVINO as a Backend to AIPG
Nuullll authored Feb 24, 2025
2 parents 1c66317 + 670259f commit 2c35529
Showing 53 changed files with 1,569 additions and 724 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -20,3 +20,4 @@ llama-cpp-env/
 *-env/
 build-envs/
 portable-git/
+llm_cache/*
2 changes: 1 addition & 1 deletion LlamaCPP/llama_adapter.py
@@ -81,7 +81,7 @@ def error_callback(self, ex: Exception):
         elif isinstance(ex, RuntimeError):
             self.put_msg({"type": "error", "err_type": "runtime_error"})
         else:
-            self.put_msg({"type": "error", "err_type": "unknow_exception"})
+            self.put_msg({"type": "error", "err_type": "unknown_exception"})
         print(f"exception:{str(ex)}")
 
     def text_conversation(self, params: LLMParams):
8 changes: 5 additions & 3 deletions LlamaCPP/llama_params.py
@@ -1,17 +1,19 @@
-from typing import Dict, List
+from typing import Any, Dict, List
 
 class LLMParams:
     prompt: List[Dict[str, str]]
     device: int
     enable_rag: bool
     model_repo_id: str
     max_tokens: int
+    generation_parameters: Dict[str, Any]
 
     def __init__(
-        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str, max_tokens: int
+        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str, max_tokens: int, **kwargs
     ) -> None:
         self.prompt = prompt
         self.device = device
         self.enable_rag = enable_rag
         self.model_repo_id = model_repo_id
-        self.max_tokens = max_tokens
+        self.max_tokens = max_tokens
+        self.generation_parameters = kwargs
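
For reference, a minimal sketch of how the extended constructor might be invoked, with the extra keyword arguments collected into generation_parameters. The argument names and repo id below are illustrative, not taken from this PR.

# Illustrative example (not part of the commit): extra keyword arguments
# such as temperature/top_p end up in params.generation_parameters.
params = LLMParams(
    prompt=[{"question": "What is OpenVINO?", "answer": ""}],
    device=0,
    enable_rag=False,
    model_repo_id="OpenVINO/TinyLlama-1.1B-Chat-v1.0-int4-ov",  # hypothetical id
    max_tokens=512,
    temperature=0.7,
    top_p=0.9,
)
print(params.generation_parameters)  # {'temperature': 0.7, 'top_p': 0.9}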
16 changes: 16 additions & 0 deletions OpenVINO/.gitignore
@@ -0,0 +1,16 @@
.vscode/
__pycache__/
models/llm/
temp/
test/
dist/
build/
cache/
test/
env/

!tools/*.exe
llm_cache/
TinyLlama-*
laion/
db/
169 changes: 169 additions & 0 deletions OpenVINO/openvino_adapter.py
@@ -0,0 +1,169 @@
import threading
from queue import Empty, Queue
import json
import traceback
from typing import Dict, List, Callable
from openvino_interface import LLMInterface
from openvino_params import LLMParams

RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"

class LLM_SSE_Adapter:
    msg_queue: Queue
    finish: bool
    signal: threading.Event
    llm_interface: LLMInterface
    should_stop: bool

    def __init__(self, llm_interface: LLMInterface):
        self.msg_queue = Queue(-1)
        self.finish = False
        self.signal = threading.Event()
        self.llm_interface = llm_interface
        self.should_stop = False

    def put_msg(self, data):
        self.msg_queue.put_nowait(data)
        self.signal.set()

    def load_model_callback(self, event: str):
        data = {"type": "load_model", "event": event}
        self.put_msg(data)

    def text_in_callback(self, msg: str):
        data = {"type": "text_in", "value": msg}
        self.put_msg(data)

    def text_out_callback(self, msg: str, type=1):
        data = {"type": "text_out", "value": msg, "dtype": type}
        self.put_msg(data)

    def first_latency_callback(self, first_latency: str):
        data = {"type": "first_token_latency", "value": first_latency}
        self.put_msg(data)

    def after_latency_callback(self, after_latency: str):
        data = {"type": "after_token_latency", "value": after_latency}
        self.put_msg(data)

    def sr_latency_callback(self, sr_latency: str):
        data = {"type": "sr_latency", "value": sr_latency}
        self.put_msg(data)

    def error_callback(self, ex: Exception):
        if (
            isinstance(ex, NotImplementedError)
            and ex.__str__() == "Access to repositories lists is not implemented."
        ):
            self.put_msg(
                {
                    "type": "error",
                    "err_type": "repositories_not_found",
                }
            )
        # elif isinstance(ex, NotEnoughDiskSpaceException):
        #     self.put_msg(
        #         {
        #             "type": "error",
        #             "err_type": "not_enough_disk_space",
        #             "need": bytes2human(ex.requires_space),
        #             "free": bytes2human(ex.free_space),
        #         }
        #     )
        # elif isinstance(ex, DownloadException):
        #     self.put_msg({"type": "error", "err_type": "download_exception"})
        # # elif isinstance(ex, llm_biz.StopGenerateException):
        # #     pass
        elif isinstance(ex, RuntimeError):
            self.put_msg({"type": "error", "err_type": "runtime_error"})
        else:
            self.put_msg({"type": "error", "err_type": "unknown_exception"})
        self.put_msg(f"exception:{str(ex)}")

    def text_conversation(self, params: LLMParams):
        thread = threading.Thread(
            target=self.text_conversation_run,
            args=[params],
        )
        thread.start()
        return self.generator()


    def stream_function(self, output):
        self.text_out_callback(output)
        if self.llm_interface.stop_generate:
            self.put_msg("Stopping generation.")
            return True  # Stop generation

        return False


    def text_conversation_run(
        self,
        params: LLMParams,
    ):
        try:
            self.llm_interface.load_model(params, callback=self.load_model_callback)

            prompt = params.prompt
            full_prompt = convert_prompt(prompt)
            self.llm_interface.create_chat_completion(full_prompt, self.stream_function, params.max_tokens)

        except Exception as ex:
            traceback.print_exc()
            self.error_callback(ex)
        finally:
            self.llm_interface.stop_generate = False
            self.finish = True
            self.signal.set()

    def generator(self):
        while True:
            while not self.msg_queue.empty():
                try:
                    data = self.msg_queue.get_nowait()
                    msg = f"data:{json.dumps(data)}\0"
                    print(msg)
                    yield msg
                except Empty:
                    break
            if not self.finish:
                self.signal.clear()
                self.signal.wait()
            else:
                break


_default_prompt = {
    "role": "system",
    "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
}

def convert_prompt(prompt: List[Dict[str, str]]):
    chat_history = [_default_prompt]
    prompt_len = prompt.__len__()
    i = 0
    while i < prompt_len:
        chat_history.append({"role": "user", "content": prompt[i].get("question")})
        if i < prompt_len - 1:
            chat_history.append(
                {"role": "assistant", "content": prompt[i].get("answer")}
            )
        i = i + 1
    return chat_history


def process_rag(
    prompt: str,
    device: str,
    text_out_callback: Callable[[str, int], None] = None,
):
    import rag
    rag.to(device)
    query_success, context, rag_source = rag.query(prompt)
    if query_success:
        print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
        prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
        if text_out_callback is not None:
            text_out_callback(rag_source, 2)
    return prompt
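
For reference, a small sketch of what convert_prompt produces; the question/answer history below is illustrative and not part of this diff.

# Illustrative example (not part of the commit): the question/answer history is
# converted into a chat message list, with the default system prompt prepended
# and the answer of the latest turn omitted.
history = [
    {"question": "What is OpenVINO?", "answer": "An inference toolkit by Intel."},
    {"question": "Does it support Intel GPUs?", "answer": ""},
]
messages = convert_prompt(history)
# [{'role': 'system', 'content': 'You are a helpful digital assistant. ...'},
#  {'role': 'user', 'content': 'What is OpenVINO?'},
#  {'role': 'assistant', 'content': 'An inference toolkit by Intel.'},
#  {'role': 'user', 'content': 'Does it support Intel GPUs?'}]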
52 changes: 52 additions & 0 deletions OpenVINO/openvino_backend.py
@@ -0,0 +1,52 @@
from typing import Dict, List, Callable
from os import path
from openvino_interface import LLMInterface
import openvino_genai
from openvino_params import LLMParams
import openvino_model_config as model_config
import gc

class OpenVino(LLMInterface):
    def __init__(self):
        self._model = None
        self.stop_generate = False
        self._last_repo_id = None

    def load_model(self, params: LLMParams, callback: Callable[[str], None] = None):
        model_repo_id = params.model_repo_id
        if self._model is None or self._last_repo_id != model_repo_id:
            if callback is not None:
                callback("start")
            self.unload_model()
            callback(params.model_repo_id)

            model_base_path = model_config.openVINOConfig.get("openvinoLLM")
            model_name = model_repo_id.replace("/", "---")
            model_path = path.abspath(path.join(model_base_path, model_name))

            enable_compile_cache = dict()
            enable_compile_cache["CACHE_DIR"] = "llm_cache"
            self._model = openvino_genai.LLMPipeline(model_path, "GPU", **enable_compile_cache)
            self._tokenizer = self._model.get_tokenizer()

            self._last_repo_id = model_repo_id
            if callback is not None:
                callback("finish")


    def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None], max_tokens: int = 1024):
        config = openvino_genai.GenerationConfig()
        config.max_new_tokens = max_tokens

        full_prompt = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True)
        return self._model.generate(full_prompt, config, streamer)


    def unload_model(self):
        if self._model is not None:
            del self._model
            gc.collect()
        self._model = None

    def get_backend_type(self):
        return "openvino"
24 changes: 24 additions & 0 deletions OpenVINO/openvino_interface.py
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from openvino_params import LLMParams

class LLMInterface(ABC):
    stop_generate: bool
    _model: Optional[object]

    @abstractmethod
    def load_model(self, params: LLMParams, **kwargs):
        pass

    @abstractmethod
    def unload_model(self):
        pass

    @abstractmethod
    def create_chat_completion(self, messages: List[Dict[str, str]]):
        pass

    @abstractmethod
    def get_backend_type(self):
        pass
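
For reference, a minimal concrete implementation of the interface, for example to exercise LLM_SSE_Adapter without OpenVINO installed. The class and its behaviour are hypothetical and not part of this diff.

# Illustrative example (not part of the commit): a stub backend that satisfies
# LLMInterface and simply echoes the last user message through the streamer.
from typing import Callable, Dict, List
from openvino_interface import LLMInterface
from openvino_params import LLMParams

class EchoBackend(LLMInterface):
    def __init__(self):
        self._model = None
        self.stop_generate = False

    def load_model(self, params: LLMParams, callback: Callable[[str], None] = None, **kwargs):
        if callback is not None:
            callback("start")
            callback("finish")
        self._model = object()  # placeholder for a real pipeline

    def unload_model(self):
        self._model = None

    def create_chat_completion(self, messages: List[Dict[str, str]], streamer=None, max_tokens: int = 1024):
        reply = f"echo: {messages[-1]['content']}"
        if streamer is not None:
            streamer(reply)
        return reply

    def get_backend_type(self):
        return "echo"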

4 changes: 4 additions & 0 deletions OpenVINO/openvino_model_config.py
@@ -0,0 +1,4 @@
openVINOConfig = {
    "openvinoLLM": "../service/models/llm/openvino",
}
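
For reference, the backend joins this base path with the model repo id after replacing "/" with "---" (see openvino_backend.load_model). A hypothetical resolution, not part of this diff:

# Illustrative example (not part of the commit): how a repo id maps to the
# on-disk directory under the configured base path.
from os import path
import openvino_model_config as model_config

repo_id = "OpenVINO/TinyLlama-1.1B-Chat-v1.0-int4-ov"  # hypothetical id
base = model_config.openVINOConfig.get("openvinoLLM")
print(path.abspath(path.join(base, repo_id.replace("/", "---"))))
# e.g. .../service/models/llm/openvino/OpenVINO---TinyLlama-1.1B-Chat-v1.0-int4-ov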

19 changes: 19 additions & 0 deletions OpenVINO/openvino_params.py
@@ -0,0 +1,19 @@
from typing import Any, Dict, List

class LLMParams:
prompt: List[Dict[str, str]]
device: int
enable_rag: bool
model_repo_id: str
max_tokens: int
generation_parameters: Dict[str, Any]

def __init__(
self, prompt: list, device: int, enable_rag: bool, model_repo_id: str, max_tokens: int, **kwargs
) -> None:
self.prompt = prompt
self.device = device
self.enable_rag = enable_rag
self.model_repo_id = model_repo_id
self.max_tokens = max_tokens
self.generation_parameters = kwargs