-opengpt allows you to run your large-scale multi-modal models on any cloud, with a single command.
+rungpt allows you to run your large-scale multi-modal models on any cloud, with a single command.
-opengpt is open-source, self-hosted, and supports all major cloud providers,
+
+rungpt is open-source, self-hosted, and supports all major cloud providers,
including AWS, GCP, and Azure.
@@ -158,13 +158,13 @@ Open-source and self-hosted
d="M12 1.5A2.5 2.5 0 0 1 14.5 4 2.5 2.5 0 0 1 12 6.5 2.5 2.5 0 0 1 9.5 4 2.5 2.5 0 0 1 12 1.5M15.87 5C18 5 20 7 20 9c2.7 0 2.7 4 0 4H4c-2.7 0-2.7-4 0-4 0-2 2-4 4.13-4 .44 1.73 2.01 3 3.87 3 1.86 0 3.43-1.27 3.87-3M5 15h3l1 7H7l-2-7m5 0h4l-1 7h-2l-1-7m6 0h3l-2 7h-2l1-7Z">
- Getting started with opengpt's open source tool is
+ Getting started with rungpt's open source tool is
just a matter of:
$
- pip install "open_gpt_torch"
+ pip install "run_gpt_torch"
diff --git a/docs/overrides/main.html b/docs/overrides/main.html
index 64b22a1..43720f0 100644
--- a/docs/overrides/main.html
+++ b/docs/overrides/main.html
@@ -1,9 +1,9 @@
{% extends "base.html" %}
{% block announce %}
-Like opengpt? Give us a
+Like rungpt? Give us a
on
+ src="{{ 'assets/images/github-logo.png' | url }}"/>
GitHub!
{% endblock %}
diff --git a/examples/chat_example.py b/examples/chat_example.py
index cb8b335..b919074 100644
--- a/examples/chat_example.py
+++ b/examples/chat_example.py
@@ -1,17 +1,21 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
is_step = False
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'nthngdy/pythia-owt2-70m-100k', precision='fp16', device_map='balanced'
)
-message = [{"role": "user", "content": "What is your name?"},
- {"role": "assistant", "content": "My name is Pythia."},
- {"role": "user", "content": 'Hello Pythia, can you tell me that is the goal of life?'}
- ]
+message = [
+ {"role": "user", "content": "What is your name?"},
+ {"role": "assistant", "content": "My name is Pythia."},
+ {
+ "role": "user",
+ "content": 'Hello Pythia, can you tell me that is the goal of life?',
+ },
+]
if not is_step:
generated_text = model.chat(
@@ -26,4 +30,4 @@
_['past_key_values'] = None
print(_)
end_measures = end_measure(start_measures)
-log_measures(end_measures, "Model generation")
\ No newline at end of file
+log_measures(end_measures, "Model generation")
diff --git a/examples/codegen_example.py b/examples/codegen_example.py
index ec55a12..3f1d885 100644
--- a/examples/codegen_example.py
+++ b/examples/codegen_example.py
@@ -1,8 +1,8 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'Salesforce/codegen-350M-mono', precision='fp16', device_map='balanced'
)
diff --git a/examples/context_example.py b/examples/context_example.py
index 952ff46..2c1b747 100644
--- a/examples/context_example.py
+++ b/examples/context_example.py
@@ -1,9 +1,9 @@
-import open_gpt
-from open_gpt.models.session import SessionManager
+import run_gpt
+from run_gpt.models.session import SessionManager
session_manager = SessionManager()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'decapoda-research/llama-7b-hf', precision='fp16', device_map='balanced'
)
diff --git a/examples/flamingo_example.py b/examples/flamingo_example.py
index 83bcf12..ec07608 100644
--- a/examples/flamingo_example.py
+++ b/examples/flamingo_example.py
@@ -4,8 +4,8 @@
import torch
from PIL import Image
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
demo_image_one = Image.open(
requests.get(
@@ -26,7 +26,7 @@
)
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'openflamingo/OpenFlamingo-9B', precision='fp16', device='cuda', device_map=None
)
diff --git a/examples/llama_example.py b/examples/llama_example.py
index 1c6e7f2..76b53e4 100644
--- a/examples/llama_example.py
+++ b/examples/llama_example.py
@@ -1,10 +1,10 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
is_step = True
# start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'decapoda-research/llama-7b-hf', precision='fp16', device_map='balanced'
)
diff --git a/examples/lora_example.py b/examples/lora_example.py
index 7e91bc1..482cc31 100644
--- a/examples/lora_example.py
+++ b/examples/lora_example.py
@@ -1,10 +1,10 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
start_measures = start_measure()
# llama + LoRA
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'yahma/llama-7b-hf',
precision='fp16',
adapter_name_or_path='jinaai/llama-code',
diff --git a/examples/moss_example.py b/examples/moss_example.py
index cd24905..638fb19 100644
--- a/examples/moss_example.py
+++ b/examples/moss_example.py
@@ -1,14 +1,14 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
model_name='fnlp/moss-moon-003-sft',
precision='fp16',
device_map='balanced',
)
-# model = open_gpt.create_model(
+# model = run_gpt.create_model(
# model_name='fnlp/moss-moon-003-sft',
# precision='bit8',
# device_map='balanced',
diff --git a/examples/pythia_example.py b/examples/pythia_example.py
index 39526cb..7d06560 100644
--- a/examples/pythia_example.py
+++ b/examples/pythia_example.py
@@ -1,12 +1,12 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'EleutherAI/pythia-12b-deduped', precision='fp16', device_map='balanced'
)
-# model = open_gpt.create_model(
+# model = run_gpt.create_model(
# 'EleutherAI/pythia-12b-deduped', precision='bit4', device_map='balanced'
# )
diff --git a/examples/rwkv_example.py b/examples/rwkv_example.py
index bffd65d..e039c2a 100644
--- a/examples/rwkv_example.py
+++ b/examples/rwkv_example.py
@@ -1,10 +1,10 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
prompt = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'ybelkada/rwkv-raven-1b5',
precision='fp16',
device_map='balanced'
diff --git a/examples/stablelm_example.py b/examples/stablelm_example.py
index 6f82ac3..38b5fbc 100644
--- a/examples/stablelm_example.py
+++ b/examples/stablelm_example.py
@@ -1,5 +1,5 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
@@ -11,12 +11,12 @@
prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>"
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
model_name='stabilityai/stablelm-tuned-alpha-7b',
precision='fp16',
device_map='balanced',
)
-# model = open_gpt.create_model(
+# model = run_gpt.create_model(
# model_name='stabilityai/stablelm-tuned-alpha-7b',
# precision='bit4',
# device_map='balanced',
diff --git a/examples/vicuna_example.py b/examples/vicuna_example.py
index a883370..8b53621 100644
--- a/examples/vicuna_example.py
+++ b/examples/vicuna_example.py
@@ -1,11 +1,11 @@
-import open_gpt
-from open_gpt.profile import end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import end_measure, log_measures, start_measure
start_measures = start_measure()
-model = open_gpt.create_model(
+model = run_gpt.create_model(
'lmsys/vicuna-7b-delta-v1.1', precision='fp16', device_map='balanced'
)
-# model = open_gpt.create_model(
+# model = run_gpt.create_model(
# 'CarperAI/stable-vicuna-13b-delta', precision='fp16', device_map='balanced'
# )
diff --git a/mkdocs.yml b/mkdocs.yml
index d5d339b..0313291 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,13 +1,13 @@
# Project information
-site_name: OpenGPT
-# site_url: https://opengpt.dev.jina.ai
+site_name: RunGPT
+# site_url: https://rungpt.dev.jina.ai
site_author: jina-ai GmbH
site_description: >-
Easy and cost-effective LLM deployments on any cloud.
# Repository
-repo_url: https://github.com/jina-ai/opengpt
-repo_name: jina-ai/opengpt
+repo_url: https://github.com/jina-ai/rungpt
+repo_name: jina-ai/rungpt
edit_uri: edit/master/docs/
#Copyright
@@ -111,11 +111,11 @@ extra:
# property: G-XXXXXX
social:
- icon: /fontawesome/brands/github
- link: https://github.com/jina-ai/opengpt
+ link: https://github.com/jina-ai/rungpt
- icon: /fontawesome/brands/python
- link: https://pypi.org/project/open-gpt-torch
+ link: https://pypi.org/project/run-gpt-torch
# - icon: /fontawesome/brands/docker
-# link: https://hub.docker.com/r/jina-ai/opengpt
+# link: https://hub.docker.com/r/jina-ai/rungpt
- icon: /fontawesome/brands/discord
link: https://discord.gg/yaNVaaGSXZ
- icon: /fontawesome/brands/twitter
@@ -135,8 +135,8 @@ nav:
- Kubernetes: docs/deployment/clouds.md
- Reference:
- CLI:
- - opengpt serve: docs/reference/cli/start.md
- - opengpt deploy: docs/reference/cli/start.md
- - Examples: https://github.com/jina-ai/opengpt/blob/main/examples
+ - rungpt serve: docs/reference/cli/start.md
+ - rungpt deploy: docs/reference/cli/start.md
+ - Examples: https://github.com/jina-ai/rungpt/blob/main/examples
- Discord: https://discord.gg/yaNVaaGSXZ
- Twitter: https://twitter.com/JinaAI_
diff --git a/open_gpt/cli/commands/about.py b/open_gpt/cli/commands/about.py
deleted file mode 100644
index 4eb226b..0000000
--- a/open_gpt/cli/commands/about.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from cleo.commands.command import Command
-
-
-class AboutCommand(Command):
- name = "about"
-
- description = "Shows information about OpenGPT."
-
- def handle(self) -> int:
- from open_gpt import __version__
-
- self.line(
- f"""\
-OpenGPT - An open-source cloud-native model serving framework.
-
-Version: {__version__}
-
-OpenGPT is a open-source cloud-native model serving framework\
- and libraries.
-See https://github.com/jina-ai/opengpt> for more information.\
-"""
- )
-
- return 0
diff --git a/open_gpt/models/chat.py b/open_gpt/models/chat.py
deleted file mode 100644
index 83ea877..0000000
--- a/open_gpt/models/chat.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import torch
-
-from typing import Optional, List
-
-MAX_LENGTH = 2048
-
-
-class ChatMixin:
- """Mixin for chat methods."""
-
- model: 'AutoModelForCausalLM'
- tokenizer: 'AutoTokenizer'
-
- @torch.inference_mode()
- def chat(self, messages: List[dict],
- max_new_tokens: Optional[int] = None,
- num_beams: int = 1,
- do_sample: bool = False,
- temperature: float = 1.0,
- top_k: int = 1,
- top_p: float = 0.9,
- repetition_penalty: float = 1.0,
- length_penalty: float = 1.0,
- no_repeat_ngram_size: int = 0,
- echo: bool = False,
- **kwargs):
- """Generate text from the given prompt.
-
- :param messages: A list of messages comprising the conversation so far.
- :param max_new_tokens: The maximum number of tokens to generate, not including the prompt.
- :param num_beams: Number of beams for beam search. 1 means no beam search.
- :param do_sample: Whether to use sampling instead of greedy decoding.
- :param temperature: The temperature to use for sampling. Only relevant if do_sample is True. Higher means more stochastic.
- :param top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Only relevant if do_sample is True.
- :param top_p: The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Only relevant if do_sample is True.
- :param repetition_penalty: The parameter for repetition penalty. 1.0 means no penalty.
- :param length_penalty: Exponential penalty to the length that is used with beam-based generation.
- It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence.
- Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences,
- while length_penalty < 0.0 encourages shorter sequences.
- :param no_repeat_ngram_size: If set to int > 0, all ngrams of that size can only occur once.
- :param echo: Whether to echo the prompt in the generated text.
- """
-
- # normalize input
- prompt = self.create_prompt_for_chat(messages)
- completion_response = self.generate(prompt=prompt, max_new_tokens=max_new_tokens, num_beams=num_beams,
- do_sample=do_sample, temperature=temperature,
- top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty,
- length_penalty=length_penalty,
- no_repeat_ngram_size=no_repeat_ngram_size, echo=echo, **kwargs)
- # normalize output
- choices = completion_response.pop('choices')
- return {'choices': [{'index': 0, 'message': {'role': 'assistant',
- 'content': choices[0]['text']},
- 'finish_reason': choices[0]['finish_reason']}
- ],
- **completion_response}
-
- @torch.inference_mode()
- def step_chat(self, messages: Optional[List[dict]] = None, input_ids: Optional[List[int]] = None, **kwargs):
- if messages is None and input_ids is None:
- raise ValueError("Either messages or input_ids must be provided.")
- if messages and input_ids:
- raise ValueError("Only one of messages or input_ids can be provided.")
-
- if messages:
- # normalize input
- prompt = self.create_prompt_for_chat(messages)
- completion_response = self.step_generate(prompt=prompt, **kwargs)
- else:
- completion_response = self.step_generate(input_ids=input_ids, **kwargs)
- # normalize output
- for response in completion_response:
- choices = response.pop('choices')
- yield {'choices': [{'index': 0, 'message': {'role': 'assistant',
- 'content': choices[0]['text']},
- 'finish_reason': choices[0]['finish_reason']}
- ],
- **response}
diff --git a/open_gpt/spqr/quantizeargs.py b/open_gpt/spqr/quantizeargs.py
deleted file mode 100644
index 37281d8..0000000
--- a/open_gpt/spqr/quantizeargs.py
+++ /dev/null
@@ -1,58 +0,0 @@
-class QuantizeArgs():
- def __init__(self,
- model_name,
- model_path,
- dataset: str = 'wikitext2',
- load_from_saved: str = None,
- seed: int = 0,
- nsamples: int = 128,
- percdamp: float = 0.01,
- wbits: int = 4,
- groupsize: int = 16,
- permutation_order: str = "identity",
- true_sequential: bool = False,
- new_eval: bool = False,
- sym: bool = False,
- perchannel: bool = True,
- qq_scale_bits: int = 3,
- round_zero: int = None,
- qq_zero_bits: int = 3,
- qq_zero_sym: bool = False,
- qq_groupsize: int = 16,
- outlier_threshold: float = 0.2,
- simplified_outliers: bool = False,
- save: str = '',
- save_safetensors: str = '',
- benchmark: int = 0,
- check: bool = False,
- skip_out_loss: bool = False,
- offload_activations: bool = False,
- dtype: str = "auto"):
- self.model_name= model_name
- self.model_path = model_path
- self.dataset = dataset
- self.load_from_saved = load_from_saved
- self.seed = seed
- self.nsamples = nsamples
- self.percdamp = percdamp
- self.wbits = wbits
- self.groupsize = groupsize
- self.permutation_order = permutation_order
- self.true_sequential = true_sequential
- self.new_eval = new_eval
- self.sym = sym
- self.perchannel = perchannel
- self.qq_scale_bits = qq_scale_bits
- self.round_zero = round_zero
- self.qq_zero_bits = qq_zero_bits
- self.qq_zero_sym = qq_zero_sym
- self.qq_groupsize = qq_groupsize
- self.outlier_threshold = outlier_threshold
- self.simplified_outliers = simplified_outliers
- self.save = save
- self.save_safetensors = save_safetensors
- self.benchmark = benchmark
- self.check = check
- self.skip_out_loss = skip_out_loss
- self.offload_activations = offload_activations
- self.dtype = dtype
\ No newline at end of file
diff --git a/open_gpt/__init__.py b/run_gpt/__init__.py
similarity index 100%
rename from open_gpt/__init__.py
rename to run_gpt/__init__.py
diff --git a/open_gpt/__main__.py b/run_gpt/__main__.py
similarity index 57%
rename from open_gpt/__main__.py
rename to run_gpt/__main__.py
index 07fb413..5379a97 100644
--- a/open_gpt/__main__.py
+++ b/run_gpt/__main__.py
@@ -1,6 +1,6 @@
import sys
if __name__ == "__main__":
- from open_gpt.cli.application import main
+ from run_gpt.cli.application import main
sys.exit(main())
diff --git a/open_gpt/cli/__init__.py b/run_gpt/cli/__init__.py
similarity index 100%
rename from open_gpt/cli/__init__.py
rename to run_gpt/cli/__init__.py
diff --git a/open_gpt/cli/application.py b/run_gpt/cli/application.py
similarity index 80%
rename from open_gpt/cli/application.py
rename to run_gpt/cli/application.py
index 75513a3..4061b79 100644
--- a/open_gpt/cli/application.py
+++ b/run_gpt/cli/application.py
@@ -4,14 +4,14 @@
from cleo.application import Application as BaseApplication
from cleo.commands.command import Command
-from open_gpt import __version__
-from open_gpt.cli.command_loader import CommandLoader
+from run_gpt import __version__
+from run_gpt.cli.command_loader import CommandLoader
def load_command(name: str) -> Callable[[], Command]:
def _load() -> Command:
words = name.split(" ")
- module = import_module("open_gpt.cli.commands." + ".".join(words))
+ module = import_module("run_gpt.cli.commands." + ".".join(words))
command_class = getattr(module, "".join(c.title() for c in words) + "Command")
command: Command = command_class()
return command
@@ -30,7 +30,7 @@ def _load() -> Command:
class Application(BaseApplication):
def __init__(self) -> None:
- super().__init__("opengpt", __version__)
+ super().__init__("rungpt", __version__)
command_loader = CommandLoader({name: load_command(name) for name in COMMANDS})
self.set_command_loader(command_loader)
diff --git a/open_gpt/cli/command_loader.py b/run_gpt/cli/command_loader.py
similarity index 100%
rename from open_gpt/cli/command_loader.py
rename to run_gpt/cli/command_loader.py
diff --git a/open_gpt/cli/commands/__init__.py b/run_gpt/cli/commands/__init__.py
similarity index 100%
rename from open_gpt/cli/commands/__init__.py
rename to run_gpt/cli/commands/__init__.py
diff --git a/run_gpt/cli/commands/about.py b/run_gpt/cli/commands/about.py
new file mode 100644
index 0000000..2d7081e
--- /dev/null
+++ b/run_gpt/cli/commands/about.py
@@ -0,0 +1,24 @@
+from cleo.commands.command import Command
+
+
+class AboutCommand(Command):
+ name = "about"
+
+ description = "Shows information about RunGPT."
+
+ def handle(self) -> int:
+        from run_gpt import __version__
+
+ self.line(
+ f"""\
+RunGPT - An open-source cloud-native model serving framework.
+
+Version: {__version__}
+
+RunGPT is an open-source cloud-native model serving framework\
+ and library.
+See https://github.com/jina-ai/rungpt for more information.\
+"""
+ )
+
+ return 0
diff --git a/open_gpt/cli/commands/deploy.py b/run_gpt/cli/commands/deploy.py
similarity index 92%
rename from open_gpt/cli/commands/deploy.py
rename to run_gpt/cli/commands/deploy.py
index 56c583f..7c0e823 100644
--- a/open_gpt/cli/commands/deploy.py
+++ b/run_gpt/cli/commands/deploy.py
@@ -54,13 +54,13 @@ class DeployCommand(Command):
To start a model deploying, you can run:
- opengpt deploy stabilityai/stablelm-tuned-alpha-3b"""
+ rungpt deploy stabilityai/stablelm-tuned-alpha-3b"""
def handle(self) -> int:
if self.option('cloud') == 'jina':
- from open_gpt.factory import create_flow
- from open_gpt.helper import asyncify
- from open_gpt.serve.flow import deploy
+ from run_gpt.factory import create_flow
+ from run_gpt.helper import asyncify
+ from run_gpt.serve.flow import deploy
if self.option('config') is None:
flow_yaml = create_flow(
diff --git a/open_gpt/cli/commands/playground.py b/run_gpt/cli/commands/playground.py
similarity index 85%
rename from open_gpt/cli/commands/playground.py
rename to run_gpt/cli/commands/playground.py
index dab6fa6..5a07708 100644
--- a/open_gpt/cli/commands/playground.py
+++ b/run_gpt/cli/commands/playground.py
@@ -19,10 +19,10 @@ class PlaygroundCommand(Command):
To start a online playground for a model, you can run:
- opengpt playground facebook/llama-7b"""
+ rungpt playground facebook/llama-7b"""
def handle(self) -> int:
- from open_gpt.serve.playground.gradio import create_playground
+ from run_gpt.serve.playground.gradio import create_playground
playground = create_playground()
playground.queue(
diff --git a/open_gpt/cli/commands/quantize.py b/run_gpt/cli/commands/quantize.py
similarity index 70%
rename from open_gpt/cli/commands/quantize.py
rename to run_gpt/cli/commands/quantize.py
index 11cf1d9..4256c76 100644
--- a/open_gpt/cli/commands/quantize.py
+++ b/run_gpt/cli/commands/quantize.py
@@ -11,15 +11,12 @@ class QuantizeCommand(Command):
options = [
option(
- 'target',
- None,
- 'The path to quantized checkpoint.',
- flag=False,
- default=''
+ 'target', None, 'The path to quantized checkpoint.', flag=False, default=''
)
]
def handle(self) -> int:
- from open_gpt.spqr import quantize
+ from run_gpt.spqr import quantize
+
_, _ = quantize(self.argument('model_name'), self.option('target'))
return 0
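For reference, a hedged sketch of the call the command body above wraps; the model name and target path are placeholders, and the two return values are left unnamed exactly as in the command:

```python
# Sketch of invoking the SpQR quantization entry point wrapped by
# `rungpt quantize`; model name and checkpoint path are placeholders.
from run_gpt.spqr import quantize

_, _ = quantize('decapoda-research/llama-7b-hf', '/path/to/quantized-checkpoint')
```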
diff --git a/open_gpt/cli/commands/serve.py b/run_gpt/cli/commands/serve.py
similarity index 94%
rename from open_gpt/cli/commands/serve.py
rename to run_gpt/cli/commands/serve.py
index edb8ea3..3a847cd 100644
--- a/open_gpt/cli/commands/serve.py
+++ b/run_gpt/cli/commands/serve.py
@@ -51,10 +51,10 @@ class ServeCommand(Command):
To start a model serving locally, you can run:
- opengpt serve stabilityai/stablelm-tuned-alpha-3b"""
+ rungpt serve stabilityai/stablelm-tuned-alpha-3b"""
def handle(self) -> int:
- from open_gpt.factory import create_flow
+ from run_gpt.factory import create_flow
with create_flow(
model_name_or_path=self.argument('model_name'),
diff --git a/open_gpt/factory.py b/run_gpt/factory.py
similarity index 95%
rename from open_gpt/factory.py
rename to run_gpt/factory.py
index 33ed021..71ee8ec 100644
--- a/open_gpt/factory.py
+++ b/run_gpt/factory.py
@@ -157,8 +157,8 @@ def create_flow(
):
from jina import Flow
- from open_gpt import __jina_version__, __version__
- from open_gpt.serve.flow import get_template
+ from run_gpt import __jina_version__, __version__
+ from run_gpt.serve.flow import get_template
# normalize the model name to be used as flow executor name
norm_name = model_name_or_path.split('/')[-1]
@@ -182,14 +182,14 @@ def create_flow(
'gateway_params': {'cors': cors},
'jina_version': __jina_version__,
'replicas': replicas,
- 'labels': {'app': 'open_gpt', 'version': __VERSION_TAG__},
+ 'labels': {'app': 'run_gpt', 'version': __VERSION_TAG__},
}
yaml = get_template('flow.yml.jinja2').render(
dockerized=dockerized,
- gateway_image=f'docker://jinaai/open_gpt_gateway:{__VERSION_TAG__}',
+ gateway_image=f'docker://jinaai/run_gpt_gateway:{__VERSION_TAG__}',
gateway_module='Gateway',
- executor_image=f'docker://jinaai/open_gpt_executor:{__VERSION_TAG__}',
+ executor_image=f'docker://jinaai/run_gpt_executor:{__VERSION_TAG__}',
executor_module='CausualLMExecutor'
if 'flamingo' not in model_name_or_path
else 'FlamingoExecutor',
diff --git a/open_gpt/helper.py b/run_gpt/helper.py
similarity index 100%
rename from open_gpt/helper.py
rename to run_gpt/helper.py
diff --git a/open_gpt/logs.py b/run_gpt/logs.py
similarity index 100%
rename from open_gpt/logs.py
rename to run_gpt/logs.py
diff --git a/open_gpt/models/__init__.py b/run_gpt/models/__init__.py
similarity index 100%
rename from open_gpt/models/__init__.py
rename to run_gpt/models/__init__.py
diff --git a/run_gpt/models/chat.py b/run_gpt/models/chat.py
new file mode 100644
index 0000000..746d67b
--- /dev/null
+++ b/run_gpt/models/chat.py
@@ -0,0 +1,107 @@
+from typing import List, Optional
+
+import torch
+
+MAX_LENGTH = 2048
+
+
+class ChatMixin:
+ """Mixin for chat methods."""
+
+ model: 'AutoModelForCausalLM'
+ tokenizer: 'AutoTokenizer'
+
+ @torch.inference_mode()
+ def chat(
+ self,
+ messages: List[dict],
+ max_new_tokens: Optional[int] = None,
+ num_beams: int = 1,
+ do_sample: bool = False,
+ temperature: float = 1.0,
+ top_k: int = 1,
+ top_p: float = 0.9,
+ repetition_penalty: float = 1.0,
+ length_penalty: float = 1.0,
+ no_repeat_ngram_size: int = 0,
+ echo: bool = False,
+ **kwargs
+ ):
+ """Generate text from the given prompt.
+
+ :param messages: A list of messages comprising the conversation so far.
+ :param max_new_tokens: The maximum number of tokens to generate, not including the prompt.
+ :param num_beams: Number of beams for beam search. 1 means no beam search.
+ :param do_sample: Whether to use sampling instead of greedy decoding.
+ :param temperature: The temperature to use for sampling. Only relevant if do_sample is True. Higher means more stochastic.
+ :param top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Only relevant if do_sample is True.
+ :param top_p: The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Only relevant if do_sample is True.
+ :param repetition_penalty: The parameter for repetition penalty. 1.0 means no penalty.
+ :param length_penalty: Exponential penalty to the length that is used with beam-based generation.
+ It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence.
+ Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences,
+ while length_penalty < 0.0 encourages shorter sequences.
+ :param no_repeat_ngram_size: If set to int > 0, all ngrams of that size can only occur once.
+ :param echo: Whether to echo the prompt in the generated text.
+ """
+
+ # normalize input
+ prompt = self.create_prompt_for_chat(messages)
+ completion_response = self.generate(
+ prompt=prompt,
+ max_new_tokens=max_new_tokens,
+ num_beams=num_beams,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_k=top_k,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ length_penalty=length_penalty,
+ no_repeat_ngram_size=no_repeat_ngram_size,
+ echo=echo,
+ **kwargs
+ )
+ # normalize output
+ choices = completion_response.pop('choices')
+ return {
+ 'choices': [
+ {
+ 'index': 0,
+ 'message': {'role': 'assistant', 'content': choices[0]['text']},
+ 'finish_reason': choices[0]['finish_reason'],
+ }
+ ],
+ **completion_response,
+ }
+
+ @torch.inference_mode()
+ def step_chat(
+ self,
+ messages: Optional[List[dict]] = None,
+ input_ids: Optional[List[int]] = None,
+ **kwargs
+ ):
+ if messages is None and input_ids is None:
+ raise ValueError("Either messages or input_ids must be provided.")
+ if messages and input_ids:
+ raise ValueError("Only one of messages or input_ids can be provided.")
+
+ if messages:
+ # normalize input
+ prompt = self.create_prompt_for_chat(messages)
+ completion_response = self.step_generate(prompt=prompt, **kwargs)
+ else:
+ completion_response = self.step_generate(input_ids=input_ids, **kwargs)
+ # normalize output
+ for response in completion_response:
+ choices = response.pop('choices')
+ yield {
+ 'choices': [
+ {
+ 'index': 0,
+ 'message': {'role': 'assistant', 'content': choices[0]['text']},
+ 'finish_reason': choices[0]['finish_reason'],
+ }
+ ],
+ **response,
+ }
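A usage sketch for the mixin above, mirroring examples/chat_example.py from this diff; the model name and generation arguments are illustrative, and the printed field follows the response shape normalized in chat():

```python
# Hedged sketch: assumes a model class that mixes in ChatMixin, as the
# example scripts in this diff obtain via run_gpt.create_model().
import run_gpt

model = run_gpt.create_model(
    'nthngdy/pythia-owt2-70m-100k', precision='fp16', device_map='balanced'
)
messages = [
    {"role": "user", "content": "What is your name?"},
    {"role": "assistant", "content": "My name is Pythia."},
    {"role": "user", "content": "Hello Pythia, can you tell me what is the goal of life?"},
]
response = model.chat(messages, max_new_tokens=64, do_sample=False)
print(response['choices'][0]['message']['content'])
```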
diff --git a/open_gpt/models/embedding.py b/run_gpt/models/embedding.py
similarity index 100%
rename from open_gpt/models/embedding.py
rename to run_gpt/models/embedding.py
diff --git a/open_gpt/models/flamingo/__init__.py b/run_gpt/models/flamingo/__init__.py
similarity index 100%
rename from open_gpt/models/flamingo/__init__.py
rename to run_gpt/models/flamingo/__init__.py
diff --git a/open_gpt/models/flamingo/config.json b/run_gpt/models/flamingo/config.json
similarity index 100%
rename from open_gpt/models/flamingo/config.json
rename to run_gpt/models/flamingo/config.json
diff --git a/open_gpt/models/flamingo/configuration_flamingo.py b/run_gpt/models/flamingo/configuration_flamingo.py
similarity index 100%
rename from open_gpt/models/flamingo/configuration_flamingo.py
rename to run_gpt/models/flamingo/configuration_flamingo.py
diff --git a/open_gpt/models/flamingo/flamingo_lm.py b/run_gpt/models/flamingo/flamingo_lm.py
similarity index 100%
rename from open_gpt/models/flamingo/flamingo_lm.py
rename to run_gpt/models/flamingo/flamingo_lm.py
diff --git a/open_gpt/models/flamingo/flamingo_model.py b/run_gpt/models/flamingo/flamingo_model.py
similarity index 99%
rename from open_gpt/models/flamingo/flamingo_model.py
rename to run_gpt/models/flamingo/flamingo_model.py
index 92e9abb..fa8cebe 100644
--- a/open_gpt/models/flamingo/flamingo_model.py
+++ b/run_gpt/models/flamingo/flamingo_model.py
@@ -5,7 +5,7 @@
from open_flamingo.src.helpers import PerceiverResampler
from torch import nn
-from open_gpt.logs import logger
+from run_gpt.logs import logger
from ...helper import auto_dtype_and_device
diff --git a/open_gpt/models/flamingo/loading.py b/run_gpt/models/flamingo/loading.py
similarity index 99%
rename from open_gpt/models/flamingo/loading.py
rename to run_gpt/models/flamingo/loading.py
index 982ace1..88c8dd0 100644
--- a/open_gpt/models/flamingo/loading.py
+++ b/run_gpt/models/flamingo/loading.py
@@ -4,7 +4,7 @@
import torch
from open_flamingo.src.utils import extend_instance
-from open_gpt.logs import logger
+from run_gpt.logs import logger
def load_model_and_transforms(
diff --git a/open_gpt/models/flamingo/modeling.py b/run_gpt/models/flamingo/modeling.py
similarity index 98%
rename from open_gpt/models/flamingo/modeling.py
rename to run_gpt/models/flamingo/modeling.py
index 7b312dc..d47e270 100644
--- a/open_gpt/models/flamingo/modeling.py
+++ b/run_gpt/models/flamingo/modeling.py
@@ -3,7 +3,7 @@
import torch
from PIL import Image
-from open_gpt.models.modeling import BaseModel
+from run_gpt.models.modeling import BaseModel
class FlamingoModel(BaseModel):
diff --git a/open_gpt/models/generation.py b/run_gpt/models/generation.py
similarity index 100%
rename from open_gpt/models/generation.py
rename to run_gpt/models/generation.py
diff --git a/open_gpt/models/llama/__init__.py b/run_gpt/models/llama/__init__.py
similarity index 100%
rename from open_gpt/models/llama/__init__.py
rename to run_gpt/models/llama/__init__.py
diff --git a/open_gpt/models/llama/loading.py b/run_gpt/models/llama/loading.py
similarity index 95%
rename from open_gpt/models/llama/loading.py
rename to run_gpt/models/llama/loading.py
index 6adf371..1ae5069 100644
--- a/open_gpt/models/llama/loading.py
+++ b/run_gpt/models/llama/loading.py
@@ -2,8 +2,8 @@
import torch
-from open_gpt.helper import get_device_map
-from open_gpt.logs import logger
+from run_gpt.helper import get_device_map
+from run_gpt.logs import logger
def load_model_and_tokenizer(
@@ -51,7 +51,7 @@ def load_model_and_tokenizer(
from packaging import version
from transformers import BitsAndBytesConfig
- from open_gpt import importlib_metadata
+ from run_gpt import importlib_metadata
trf_version = importlib_metadata.version("transformers")
if 'dev' in trf_version:
diff --git a/open_gpt/models/llama/modeling.py b/run_gpt/models/llama/modeling.py
similarity index 94%
rename from open_gpt/models/llama/modeling.py
rename to run_gpt/models/llama/modeling.py
index 679d967..b5e1ed3 100644
--- a/open_gpt/models/llama/modeling.py
+++ b/run_gpt/models/llama/modeling.py
@@ -2,7 +2,7 @@
import torch
-from open_gpt.models.modeling import BaseModel
+from run_gpt.models.modeling import BaseModel
class LlamaModel(BaseModel):
diff --git a/open_gpt/models/loading.py b/run_gpt/models/loading.py
similarity index 95%
rename from open_gpt/models/loading.py
rename to run_gpt/models/loading.py
index 9731de1..847275d 100644
--- a/open_gpt/models/loading.py
+++ b/run_gpt/models/loading.py
@@ -2,8 +2,8 @@
import torch
-from open_gpt.helper import _DEFAULT_FP16_DTYPE, get_device_map
-from open_gpt.logs import logger
+from run_gpt.helper import _DEFAULT_FP16_DTYPE, get_device_map
+from run_gpt.logs import logger
def load_model_and_tokenizer(
@@ -50,7 +50,7 @@ def load_model_and_tokenizer(
elif precision == 'bit4':
from packaging import version
- from open_gpt import importlib_metadata
+ from run_gpt import importlib_metadata
trf_version = importlib_metadata.version("transformers")
if 'dev' in trf_version:
diff --git a/open_gpt/models/modeling.py b/run_gpt/models/modeling.py
similarity index 100%
rename from open_gpt/models/modeling.py
rename to run_gpt/models/modeling.py
diff --git a/open_gpt/models/moss/__init__.py b/run_gpt/models/moss/__init__.py
similarity index 100%
rename from open_gpt/models/moss/__init__.py
rename to run_gpt/models/moss/__init__.py
diff --git a/open_gpt/models/moss/modeling.py b/run_gpt/models/moss/modeling.py
similarity index 93%
rename from open_gpt/models/moss/modeling.py
rename to run_gpt/models/moss/modeling.py
index 0396e24..cb2f7ff 100644
--- a/open_gpt/models/moss/modeling.py
+++ b/run_gpt/models/moss/modeling.py
@@ -1,7 +1,7 @@
from typing import List
-from open_gpt.models.modeling import BaseModel
-from open_gpt.logs import logger
+from run_gpt.logs import logger
+from run_gpt.models.modeling import BaseModel
class MossModel(BaseModel):
@@ -16,9 +16,9 @@ class MossModel(BaseModel):
The quick usage is summarized in the snippet below:
```python
- import open_gpt
+ import run_gpt
- model = open_gpt.create_model(
+ model = run_gpt.create_model(
'fnlp/moss-moon-003-sft', precision='fp16', device_map='balanced'
)
@@ -75,7 +75,9 @@ def create_prompt_for_chat(self, messages: List[dict]) -> str:
content = message['content']
if role == 'system':
- logger.warning('System message detected, but MOSS has a specific system instruction, will skip ...')
+ logger.warning(
+ 'System message detected, but MOSS has a specific system instruction, will skip ...'
+ )
elif role == 'user':
string_messages += f'<|Human|>: {content}\n'
elif role == 'assistant':
diff --git a/open_gpt/models/pythia/__init__.py b/run_gpt/models/pythia/__init__.py
similarity index 100%
rename from open_gpt/models/pythia/__init__.py
rename to run_gpt/models/pythia/__init__.py
diff --git a/open_gpt/models/pythia/modeling.py b/run_gpt/models/pythia/modeling.py
similarity index 74%
rename from open_gpt/models/pythia/modeling.py
rename to run_gpt/models/pythia/modeling.py
index bfe138d..ada5e9e 100644
--- a/open_gpt/models/pythia/modeling.py
+++ b/run_gpt/models/pythia/modeling.py
@@ -1,4 +1,4 @@
-from open_gpt.models.modeling import BaseModel
+from run_gpt.models.modeling import BaseModel
class PythiaModel(BaseModel):
@@ -9,12 +9,12 @@ class PythiaModel(BaseModel):
See https://github.com/EleutherAI/pythia for more information.
- The quick way to use Pythia via :meth:`open_gpt.create_model`:
+ The quick way to use Pythia via :meth:`run_gpt.create_model`:
```python
- import open_gpt
+ import run_gpt
- model = open_gpt.create_model(
+ model = run_gpt.create_model(
'EleutherAI/pythia-12b-deduped', precision='fp16', device_map='balanced'
)
```
diff --git a/open_gpt/models/rwkv/__init__.py b/run_gpt/models/rwkv/__init__.py
similarity index 100%
rename from open_gpt/models/rwkv/__init__.py
rename to run_gpt/models/rwkv/__init__.py
diff --git a/open_gpt/models/rwkv/modeling.py b/run_gpt/models/rwkv/modeling.py
similarity index 96%
rename from open_gpt/models/rwkv/modeling.py
rename to run_gpt/models/rwkv/modeling.py
index a573c2f..639873c 100644
--- a/open_gpt/models/rwkv/modeling.py
+++ b/run_gpt/models/rwkv/modeling.py
@@ -3,7 +3,7 @@
import torch
import torch.nn.functional as F
-from open_gpt.models.modeling import BaseModel
+from run_gpt.models.modeling import BaseModel
class RWKVModel(BaseModel):
diff --git a/open_gpt/models/session.py b/run_gpt/models/session.py
similarity index 100%
rename from open_gpt/models/session.py
rename to run_gpt/models/session.py
diff --git a/open_gpt/models/stablelm/__init__.py b/run_gpt/models/stablelm/__init__.py
similarity index 100%
rename from open_gpt/models/stablelm/__init__.py
rename to run_gpt/models/stablelm/__init__.py
diff --git a/open_gpt/models/stablelm/modeling.py b/run_gpt/models/stablelm/modeling.py
similarity index 87%
rename from open_gpt/models/stablelm/modeling.py
rename to run_gpt/models/stablelm/modeling.py
index 4b47362..1159a82 100644
--- a/open_gpt/models/stablelm/modeling.py
+++ b/run_gpt/models/stablelm/modeling.py
@@ -10,8 +10,8 @@
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
-from open_gpt.models.modeling import BaseModel
-from open_gpt.logs import logger
+from run_gpt.logs import logger
+from run_gpt.models.modeling import BaseModel
class StopOnTokens(StoppingCriteria):
@@ -30,12 +30,12 @@ class StableLMModel(BaseModel):
See https://github.com/Stability-AI/StableLM for more details.
- The quick way to use StableLM via :meth:`open_gpt.create_model`:
+ The quick way to use StableLM via :meth:`run_gpt.create_model`:
```python
- import open_gpt
+ import run_gpt
- model = open_gpt.create_model('stabilityai/stablelm-tuned-alpha-7b')
+ model = run_gpt.create_model('stabilityai/stablelm-tuned-alpha-7b')
system_prompt = (
'<|SYSTEM|># StableLM Tuned (Alpha version)\n'
@@ -63,7 +63,7 @@ class StableLMModel(BaseModel):
)
# Generate text with StableLM-StableVicuna-13B
- model = open_gpt.create_model('CarperAI/stable-vicuna-13b-delta')
+ model = run_gpt.create_model('CarperAI/stable-vicuna-13b-delta')
```
"""
@@ -89,7 +89,7 @@ def generate(self, prompts: Union[str, List[str]], **kwargs):
if not self.is_vicuna_model
else None,
skip_special_tokens=False,
- **kwargs
+ **kwargs,
)
def create_prompt_for_chat(self, messages: List[dict]) -> str:
@@ -99,7 +99,9 @@ def create_prompt_for_chat(self, messages: List[dict]) -> str:
content = message['content']
if role == 'system':
- logger.warning('System message detected, but StableLM has a specific system instruction, will skip ...')
+ logger.warning(
+ 'System message detected, but StableLM has a specific system instruction, will skip ...'
+ )
elif role == 'user':
string_messages += f'<|USER|>{content}'
elif role == 'assistant':
diff --git a/open_gpt/models/vicuna/__init__.py b/run_gpt/models/vicuna/__init__.py
similarity index 100%
rename from open_gpt/models/vicuna/__init__.py
rename to run_gpt/models/vicuna/__init__.py
diff --git a/open_gpt/models/vicuna/loading.py b/run_gpt/models/vicuna/loading.py
similarity index 98%
rename from open_gpt/models/vicuna/loading.py
rename to run_gpt/models/vicuna/loading.py
index 2491dae..34e9503 100644
--- a/open_gpt/models/vicuna/loading.py
+++ b/run_gpt/models/vicuna/loading.py
@@ -2,7 +2,7 @@
import torch
-from open_gpt.logs import logger
+from run_gpt.logs import logger
def load_model_and_tokenizer(
@@ -26,7 +26,7 @@ def load_model_and_tokenizer(
if precision in ['bit4', 'bit8']:
from packaging import version
- from open_gpt import importlib_metadata
+ from run_gpt import importlib_metadata
trf_version = importlib_metadata.version("transformers")
if 'dev' in trf_version:
diff --git a/open_gpt/models/vicuna/modeling.py b/run_gpt/models/vicuna/modeling.py
similarity index 86%
rename from open_gpt/models/vicuna/modeling.py
rename to run_gpt/models/vicuna/modeling.py
index d7cdd61..845f490 100644
--- a/open_gpt/models/vicuna/modeling.py
+++ b/run_gpt/models/vicuna/modeling.py
@@ -2,7 +2,7 @@
import torch
-from open_gpt.models.llama.modeling import LlamaModel
+from run_gpt.models.llama.modeling import LlamaModel
class VicunaModel(LlamaModel):
@@ -11,12 +11,12 @@ class VicunaModel(LlamaModel):
Vicuna is trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
See https://vicuna.lmsys.org/ for more details.
- The quick way to use Vicuna via :meth:`open_gpt.create_model`:
+ The quick way to use Vicuna via :meth:`run_gpt.create_model`:
```python
- import open_gpt
+ import run_gpt
- model = open_gpt.create_model('lmsys/vicuna-7b-delta-v1.1')
+ model = run_gpt.create_model('lmsys/vicuna-7b-delta-v1.1')
# Generate text
text_out = model.generate_text(prompts='Hello, my name is', max_length=50)
@@ -25,9 +25,9 @@ class VicunaModel(LlamaModel):
If you want to run inference with lower precision and/or on a specific device, you can do:
```python
- import open_gpt
+ import run_gpt
- model = open_gpt.create_model(
+ model = run_gpt.create_model(
'lmsys/vicuna-7b-delta-v1.1', precision='fp16', device_map='balanced'
)
```
diff --git a/open_gpt/profile.py b/run_gpt/profile.py
similarity index 93%
rename from open_gpt/profile.py
rename to run_gpt/profile.py
index 2dedf84..8f3887c 100644
--- a/open_gpt/profile.py
+++ b/run_gpt/profile.py
@@ -117,13 +117,11 @@ def end_measure(start_measures):
# GPU mem
for i in range(torch.cuda.device_count()):
measures[str(i)] = (
- torch.cuda.memory_allocated(i) - start_measures[
- str(i)]
- ) / GB
+ torch.cuda.memory_allocated(i) - start_measures[str(i)]
+ ) / GB
measures[f"{i}-peak"] = (
- torch.cuda.max_memory_allocated(i) -
- start_measures[str(i)]
- ) / GB
+ torch.cuda.max_memory_allocated(i) - start_measures[str(i)]
+ ) / GB
return measures
@@ -167,8 +165,12 @@ def end_record(self, generation_outputs: Union[str, List[str]]):
)
else:
num_tokens = sum(
- list(map(lambda x: len(self._tokenizer(x)['input_ids']) - 2,
- generation_outputs))
+ list(
+ map(
+ lambda x: len(self._tokenizer(x)['input_ids']) - 2,
+ generation_outputs,
+ )
+ )
)
self._generation_length.append(num_tokens)
self._time_stamp = None
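The profiling helpers reformatted above are used throughout the example scripts in this diff; a minimal sketch of that pattern:

```python
# Mirrors the pattern in examples/pythia_example.py and the other examples:
# measure wall time and GPU memory around model creation and generation.
from run_gpt.profile import end_measure, log_measures, start_measure

start_measures = start_measure()
# ... create the model and run generation here ...
end_measures = end_measure(start_measures)
log_measures(end_measures, "Model generation")
```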
diff --git a/open_gpt/resources/flow.yml.jinja2 b/run_gpt/resources/flow.yml.jinja2
similarity index 94%
rename from open_gpt/resources/flow.yml.jinja2
rename to run_gpt/resources/flow.yml.jinja2
index b0561d0..aca4ace 100644
--- a/open_gpt/resources/flow.yml.jinja2
+++ b/run_gpt/resources/flow.yml.jinja2
@@ -16,7 +16,7 @@ gateway:
uses:
jtype: {{ gateway_module }}
py_modules:
- - open_gpt.serve.gateway
+ - run_gpt.serve.gateway
{% else -%}
uses: {{ gateway_image }}
{% endif -%}
@@ -36,7 +36,7 @@ executors:
uses:
jtype: {{ executor_module }}
py_modules:
- - open_gpt.serve.executors
+ - run_gpt.serve.executors
{% else -%}
uses: {{ executor_image }}
{% endif -%}
diff --git a/open_gpt/serve/__init__.py b/run_gpt/serve/__init__.py
similarity index 100%
rename from open_gpt/serve/__init__.py
rename to run_gpt/serve/__init__.py
diff --git a/open_gpt/serve/executors/__init__.py b/run_gpt/serve/executors/__init__.py
similarity index 100%
rename from open_gpt/serve/executors/__init__.py
rename to run_gpt/serve/executors/__init__.py
diff --git a/open_gpt/serve/executors/base.py b/run_gpt/serve/executors/base.py
similarity index 98%
rename from open_gpt/serve/executors/base.py
rename to run_gpt/serve/executors/base.py
index 964d02c..2783d9b 100644
--- a/open_gpt/serve/executors/base.py
+++ b/run_gpt/serve/executors/base.py
@@ -6,8 +6,8 @@
from docarray import DocumentArray
from jina import Executor, requests
-from open_gpt.factory import create_model
-from open_gpt.logs import logger
+from run_gpt.factory import create_model
+from run_gpt.logs import logger
class CausualLMExecutor(Executor):
diff --git a/open_gpt/serve/executors/flamingo.py b/run_gpt/serve/executors/flamingo.py
similarity index 96%
rename from open_gpt/serve/executors/flamingo.py
rename to run_gpt/serve/executors/flamingo.py
index f988f5b..aa3cd01 100644
--- a/open_gpt/serve/executors/flamingo.py
+++ b/run_gpt/serve/executors/flamingo.py
@@ -7,8 +7,8 @@
from docarray import DocumentArray
from jina import Executor, requests
-import open_gpt
-from open_gpt.logs import logger
+import run_gpt
+from run_gpt.logs import logger
class FlamingoExecutor(Executor):
@@ -38,7 +38,7 @@ def __init__(
'`device_map` is not supported in FlamingoExecutor. Ignored.'
)
- self.model = open_gpt.create_model(
+ self.model = run_gpt.create_model(
model_name_or_path, precision=precision, device_map=None, **kwargs
)
diff --git a/open_gpt/serve/executors/utils.py b/run_gpt/serve/executors/utils.py
similarity index 100%
rename from open_gpt/serve/executors/utils.py
rename to run_gpt/serve/executors/utils.py
diff --git a/open_gpt/serve/flow.py b/run_gpt/serve/flow.py
similarity index 93%
rename from open_gpt/serve/flow.py
rename to run_gpt/serve/flow.py
index e24133d..a17a2c7 100644
--- a/open_gpt/serve/flow.py
+++ b/run_gpt/serve/flow.py
@@ -7,7 +7,7 @@
@cache
def get_template(template):
if template.endswith('.jinja2'):
- from open_gpt import __resources_path__
+ from run_gpt import __resources_path__
env = Environment(loader=FileSystemLoader(__resources_path__))
return env.get_template(template)
diff --git a/open_gpt/serve/gateway.py b/run_gpt/serve/gateway.py
similarity index 73%
rename from open_gpt/serve/gateway.py
rename to run_gpt/serve/gateway.py
index aa007d7..fc08077 100644
--- a/open_gpt/serve/gateway.py
+++ b/run_gpt/serve/gateway.py
@@ -1,13 +1,19 @@
"""The serve module provides a simple way to serve a model using Jina."""
+from datetime import datetime
+
import jina
+from fastapi.encoders import jsonable_encoder
from jina import Document, DocumentArray
from jina import Gateway as BaseGateway
from jina.serve.runtimes.servers.composite import CompositeServer
-from .pydantic_models import GenerateRequest, ChatRequest, BaseResponse, ResponseObjectEnum
-from fastapi.encoders import jsonable_encoder
-from datetime import datetime
+from .pydantic_models import (
+ BaseResponse,
+ ChatRequest,
+ GenerateRequest,
+ ResponseObjectEnum,
+)
class Gateway(BaseGateway, CompositeServer):
@@ -47,15 +53,15 @@ async def generate(payload: GenerateRequest = Body(...)):
parameters = _update_key(parameters)
async for docs, error in self.streamer.stream(
- docs=DocumentArray(
- [
- Document(
- tags={'prompt': payload.prompt},
- )
- ]
- ),
- exec_endpoint='/generate',
- parameters=parameters,
+ docs=DocumentArray(
+ [
+ Document(
+ tags={'prompt': payload.prompt},
+ )
+ ]
+ ),
+ exec_endpoint='/generate',
+ parameters=parameters,
):
if error:
return JSONResponse(
@@ -68,9 +74,13 @@ async def generate(payload: GenerateRequest = Body(...)):
return JSONResponse(
status_code=status.HTTP_200_OK,
- content=jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.GENERATION,
- created=int(datetime.now().timestamp()))
- )
+ content=jsonable_encoder(
+ BaseResponse(
+ **_tags,
+ object=ResponseObjectEnum.GENERATION,
+ created=int(datetime.now().timestamp())
+ )
+ ),
)
@app.api_route(path='/generate_stream', methods=['POST'])
@@ -97,9 +107,9 @@ async def event_generator():
parameters['completion_tokens'] = completion_tokens
async for docs, error in self.streamer.stream(
- docs=input_docs,
- exec_endpoint='/generate_stream',
- parameters=parameters,
+ docs=input_docs,
+ exec_endpoint='/generate_stream',
+ parameters=parameters,
):
if error:
# TODO: find best practice to handle errors in sse
@@ -113,9 +123,9 @@ async def event_generator():
stop_flag = docs[0].tags.get('choices')[0].get(
'finish_reason'
) in [
- 'stop',
- 'length',
- ]
+ 'stop',
+ 'length',
+ ]
completion_tokens += 1
_tags = docs[0].tags.copy()
@@ -124,9 +134,15 @@ async def event_generator():
_tags['usage'] = {
k: int(v) for k, v in _tags['usage'].items()
}
- yield {"data": jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.GENERATION,
- created=int(datetime.now().timestamp()))
- )}
+ yield {
+ "data": jsonable_encoder(
+ BaseResponse(
+ **_tags,
+ object=ResponseObjectEnum.GENERATION,
+ created=int(datetime.now().timestamp())
+ )
+ )
+ }
input_docs = DocumentArray(
[
@@ -151,15 +167,15 @@ async def chat(payload: ChatRequest = Body(...)):
parameters = _update_key(parameters)
async for docs, error in self.streamer.stream(
- docs=DocumentArray(
- [
- Document(
- tags={'prompt': payload.messages},
- )
- ]
- ),
- exec_endpoint='/chat',
- parameters=parameters,
+ docs=DocumentArray(
+ [
+ Document(
+ tags={'prompt': payload.messages},
+ )
+ ]
+ ),
+ exec_endpoint='/chat',
+ parameters=parameters,
):
if error:
return JSONResponse(
@@ -172,9 +188,13 @@ async def chat(payload: ChatRequest = Body(...)):
return JSONResponse(
status_code=status.HTTP_200_OK,
- content=jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.CHAT,
- created=int(datetime.now().timestamp()))
- ),
+ content=jsonable_encoder(
+ BaseResponse(
+ **_tags,
+ object=ResponseObjectEnum.CHAT,
+ created=int(datetime.now().timestamp())
+ )
+ ),
)
@app.api_route(path='/chat_stream', methods=['POST'])
@@ -201,9 +221,9 @@ async def event_generator():
parameters['completion_tokens'] = completion_tokens
async for docs, error in self.streamer.stream(
- docs=input_docs,
- exec_endpoint='/chat_stream',
- parameters=parameters,
+ docs=input_docs,
+ exec_endpoint='/chat_stream',
+ parameters=parameters,
):
if error:
# TODO: find best practice to handle errors in sse
@@ -217,9 +237,9 @@ async def event_generator():
stop_flag = docs[0].tags.get('choices')[0].get(
'finish_reason'
) in [
- 'stop',
- 'length',
- ]
+ 'stop',
+ 'length',
+ ]
completion_tokens += 1
_tags = docs[0].tags.copy()
@@ -228,9 +248,15 @@ async def event_generator():
_tags['usage'] = {
k: int(v) for k, v in _tags['usage'].items()
}
- yield {"data": jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.CHAT,
- created=int(datetime.now().timestamp()))
- )}
+ yield {
+ "data": jsonable_encoder(
+ BaseResponse(
+ **_tags,
+ object=ResponseObjectEnum.CHAT,
+ created=int(datetime.now().timestamp())
+ )
+ )
+ }
input_docs = DocumentArray(
[
diff --git a/open_gpt/serve/playground/__init__.py b/run_gpt/serve/playground/__init__.py
similarity index 100%
rename from open_gpt/serve/playground/__init__.py
rename to run_gpt/serve/playground/__init__.py
diff --git a/open_gpt/serve/playground/gradio.py b/run_gpt/serve/playground/gradio.py
similarity index 98%
rename from open_gpt/serve/playground/gradio.py
rename to run_gpt/serve/playground/gradio.py
index 58b1b28..54e5899 100644
--- a/open_gpt/serve/playground/gradio.py
+++ b/run_gpt/serve/playground/gradio.py
@@ -23,7 +23,7 @@
title_markdown = """
# ☄️ Chat with Large-scale Multimodal Models
-[[Project Page]](https://opengpt.github.io) [[Code]](https://github.com/jinaai/opengpt)
+[[Project Page]](https://rungpt.github.io) [[Code]](https://github.com/jinaai/rungpt)
"""
_get_window_url_params = """
diff --git a/open_gpt/serve/playground/gradio_chatbot.py b/run_gpt/serve/playground/gradio_chatbot.py
similarity index 100%
rename from open_gpt/serve/playground/gradio_chatbot.py
rename to run_gpt/serve/playground/gradio_chatbot.py
diff --git a/open_gpt/serve/playground/gradio_css.py b/run_gpt/serve/playground/gradio_css.py
similarity index 100%
rename from open_gpt/serve/playground/gradio_css.py
rename to run_gpt/serve/playground/gradio_css.py
diff --git a/open_gpt/serve/pydantic_models.py b/run_gpt/serve/pydantic_models.py
similarity index 54%
rename from open_gpt/serve/pydantic_models.py
rename to run_gpt/serve/pydantic_models.py
index abe11f9..dc5839d 100644
--- a/open_gpt/serve/pydantic_models.py
+++ b/run_gpt/serve/pydantic_models.py
@@ -1,6 +1,7 @@
-from pydantic import BaseModel, Field
from enum import Enum
-from typing import Union, List, Tuple, Any
+from typing import Any, List, Tuple, Union
+
+from pydantic import BaseModel, Field
class BaseRequest(BaseModel):
@@ -25,29 +26,41 @@ class BaseRequest(BaseModel):
)
logprobs: int = Field(
description='Include the log probabilities on the logprobs '
- 'most likely tokens, as well the chosen tokens',
+ 'most likely tokens, as well the chosen tokens',
default=None,
)
echo: bool = Field(
description='Echo back the prompt in the completion.', default=None
)
- stop: Union[str, List[str]] = Field(description='Stop sequence generation on token.', default=None)
- stop_str: Union[str, List[str]] = Field(description='Stop sequence generation on token.', default=None)
+ stop: Union[str, List[str]] = Field(
+ description='Stop sequence generation on token.', default=None
+ )
+ stop_str: Union[str, List[str]] = Field(
+ description='Stop sequence generation on token.', default=None
+ )
do_sample: bool = Field(
description='Whether to sample from the generation.', default=None
)
- presence_penalty: float = Field(description='Positive values penalize new tokens based on whether they appear in '
- 'the text so far, increasing the likelihood to talk about new topics.',
- default=0)
- frequency_penalty: float = Field(description='Positive values penalize new tokens based on their existing '
- 'frequency in the text so far, decreasing the likelihood to repeat '
- 'the same line verbatim.',
- default=0)
- best_of: int = Field(description='Generates best_of completions server-side and returns the "best" (the one with '
- 'the highest log probability per token). Results cannot be streamed.',
- default=None)
+ presence_penalty: float = Field(
+ description='Positive values penalize new tokens based on whether they appear in '
+ 'the text so far, increasing the likelihood to talk about new topics.',
+ default=0,
+ )
+ frequency_penalty: float = Field(
+ description='Positive values penalize new tokens based on their existing '
+ 'frequency in the text so far, decreasing the likelihood to repeat '
+ 'the same line verbatim.',
+ default=0,
+ )
+ best_of: int = Field(
+ description='Generates best_of completions server-side and returns the "best" (the one with '
+ 'the highest log probability per token). Results cannot be streamed.',
+ default=None,
+ )
n: int = Field(description='The number of sequences to return.', default=None)
- num_return_sequences: int = Field(description='The number of sequences to return.', default=None)
+ num_return_sequences: int = Field(
+ description='The number of sequences to return.', default=None
+ )
class GenerateRequest(BaseRequest):
@@ -77,7 +90,7 @@ class Config:
'best_of': 5,
'logprobs': None,
'n': 3,
- 'num_return_sequences': 3
+ 'num_return_sequences': 3,
}
}
@@ -91,8 +104,10 @@ class Config:
schema_extra = {
'example': {
- 'messages': [{"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": "Hello!"}],
+ 'messages': [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "Hello!"},
+ ],
'id': '18d92585-7b66-4b7c-b818-71287c122c57',
'num_beams': 5,
'max_tokens': 50,
@@ -110,7 +125,7 @@ class Config:
'best_of': 5,
'logprobs': None,
'n': 3,
- 'num_return_sequences': 3
+ 'num_return_sequences': 3,
}
}
@@ -124,18 +139,26 @@ class BaseResponse(BaseModel):
# session id
id: str = Field(description='The session id of the generation.', default=None)
- object: ResponseObjectEnum = Field(description='The task type of the response.', default=None)
+ object: ResponseObjectEnum = Field(
+ description='The task type of the response.', default=None
+ )
created: int = Field(description='The timestamp of the response.', default=None)
- choices: List[dict] = Field(description='The generated text. It contains 5 keys: `index`, `text`, `message`, `logprobs`, '
- '`finish_reason`. For generation mode, `message` is None. For chat mode, '
- '`text` is None.')
- prompt: str = Field(description='The prompt used to generate the response.', default=None)
- usage: dict = Field(description='The usage of the model. It contains 3 keys: `prompt_tokens`, '
- '`completion_tokens`, `total_tokens`. `prompt_tokens` is the length of input, '
- 'in streaming mode this also includes the length of past_key_values. '
- '`completion_tokens` is the length of the generated text, in streaming mode this '
- 'also includes the length of text generated in previous steps.'
- '`total_tokens` is the total length of the `prompt_tokens` and `completion_tokens`.')
+ choices: List[dict] = Field(
+ description='The generated text. It contains 5 keys: `index`, `text`, `message`, `logprobs`, '
+ '`finish_reason`. For generation mode, `message` is None. For chat mode, '
+ '`text` is None.'
+ )
+ prompt: str = Field(
+ description='The prompt used to generate the response.', default=None
+ )
+ usage: dict = Field(
+ description='The usage of the model. It contains 3 keys: `prompt_tokens`, '
+ '`completion_tokens`, `total_tokens`. `prompt_tokens` is the length of input, '
+ 'in streaming mode this also includes the length of past_key_values. '
+ '`completion_tokens` is the length of the generated text, in streaming mode this '
+ 'also includes the length of text generated in previous steps. '
+ '`total_tokens` is the total length of the `prompt_tokens` and `completion_tokens`.'
+ )
class Config:
allow_population_by_field_name = True
@@ -146,15 +169,24 @@ class Config:
'id': '18d92585-7b66-4b7c-b818-71287c122c57',
'object': 'chat.completion',
'created': 12345678,
- 'choices': [{"index": 0,
- "text": None,
- "message": {
- "role": "assistant",
- "content": "\n\nHello there, how may I assist you today?",
- },
- "logprobs": None,
- "finish_reason": "length"}],
+ 'choices': [
+ {
+ "index": 0,
+ "text": None,
+ "message": {
+ "role": "assistant",
+ "content": "\n\nHello there, how may I assist you today?",
+ },
+ "logprobs": None,
+ "finish_reason": "length",
+ }
+ ],
'prompt': 'Hello there.',
- 'usage': {'prompt_tokens': 0, 'input_length': 10, 'completion_tokens': 10, 'total_tokens': 20},
+ 'usage': {
+ 'prompt_tokens': 0,
+ 'input_length': 10,
+ 'completion_tokens': 10,
+ 'total_tokens': 20,
+ },
}
}
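
The hunks above reformat the pydantic request/response schemas. As a quick sanity check of the field layout, here is a minimal, self-contained sketch (not part of the patch) that mirrors those field names with plain `Field` declarations and validates a payload shaped like the `schema_extra` example; the class name `ChatResponse` and the simplified `object: str` type are illustrative assumptions, not the package's actual module.

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class ChatResponse(BaseModel):
    """Illustrative stand-in mirroring the BaseResponse fields from the hunk above."""

    id: Optional[str] = Field(default=None, description='The session id of the generation.')
    object: Optional[str] = Field(default=None, description='The task type of the response.')
    created: Optional[int] = Field(default=None, description='The timestamp of the response.')
    choices: List[dict] = Field(
        description='One entry per generation: index, text, message, logprobs, finish_reason.'
    )
    prompt: Optional[str] = Field(default=None, description='The prompt used to generate the response.')
    usage: dict = Field(
        default_factory=dict,
        description='prompt_tokens, completion_tokens and total_tokens counts.',
    )


resp = ChatResponse(
    id='18d92585-7b66-4b7c-b818-71287c122c57',
    object='chat.completion',
    created=12345678,
    choices=[
        {
            'index': 0,
            'text': None,
            'message': {'role': 'assistant', 'content': 'Hello there, how may I assist you today?'},
            'logprobs': None,
            'finish_reason': 'length',
        }
    ],
    prompt='Hello there.',
    usage={'prompt_tokens': 10, 'completion_tokens': 10, 'total_tokens': 20},
)
print(resp.choices[0]['message']['content'])
```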
diff --git a/open_gpt/spqr/datautils.py b/run_gpt/spqr/datautils.py
similarity index 90%
rename from open_gpt/spqr/datautils.py
rename to run_gpt/spqr/datautils.py
index 0a89435..7caf7da 100644
--- a/open_gpt/spqr/datautils.py
+++ b/run_gpt/spqr/datautils.py
@@ -61,10 +61,16 @@ def get_c4(nsamples, seed, seqlen, model_path):
from datasets import load_dataset
traindata = load_dataset(
- "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train"
+ "allenai/c4",
+ "allenai--c4",
+ data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
+ split="train",
)
valdata = load_dataset(
- "allenai/c4", "allenai--c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation"
+ "allenai/c4",
+ "allenai--c4",
+ data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
+ split="validation",
)
tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
@@ -138,10 +144,16 @@ def get_c4_new(nsamples, seed, seqlen, model_path):
from datasets import load_dataset
traindata = load_dataset(
- "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train"
+ "allenai/c4",
+ "allenai--c4",
+ data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
+ split="train",
)
valdata = load_dataset(
- "allenai/c4", "allenai--c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation"
+ "allenai/c4",
+ "allenai--c4",
+ data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
+ split="validation",
)
tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
diff --git a/open_gpt/spqr/quant_groups.py b/run_gpt/spqr/quant_groups.py
similarity index 83%
rename from open_gpt/spqr/quant_groups.py
rename to run_gpt/spqr/quant_groups.py
index 9153923..d5c8812 100644
--- a/open_gpt/spqr/quant_groups.py
+++ b/run_gpt/spqr/quant_groups.py
@@ -95,17 +95,37 @@ def find_params(self, x, weight=False):
if self.qq_scale_bits is not None:
scale_groups = self.scale.reshape(-1, self.qq_groupsize)
self.qq_scale = Quantizer(shape=scale_groups.shape)
- self.qq_scale.configure(self.qq_scale_bits, perchannel=True, sym=False, round_zero=False, **self.qqq_params)
+ self.qq_scale.configure(
+ self.qq_scale_bits,
+ perchannel=True,
+ sym=False,
+ round_zero=False,
+ **self.qqq_params
+ )
self.qq_scale.find_params(scale_groups, weight=True)
- assert self.qq_scale.scale.shape == (scale_groups.shape[0], 1), self.qq_scale.scale.shape
+ assert self.qq_scale.scale.shape == (
+ scale_groups.shape[0],
+ 1,
+ ), self.qq_scale.scale.shape
self.scale = self.qq_scale.quantize(scale_groups).reshape_as(self.scale)
- if self.qq_zero_bits is not None and ((not self.round_zero) or self.qq_zero_bits < self.bits):
+ if self.qq_zero_bits is not None and (
+ (not self.round_zero) or self.qq_zero_bits < self.bits
+ ):
zero_groups = self.zero.reshape(-1, self.qq_groupsize)
self.qq_zero = Quantizer(shape=zero_groups.shape)
- self.qq_zero.configure(self.qq_zero_bits, perchannel=True, sym=self.qq_zero_sym, round_zero=False, **self.qqq_params)
+ self.qq_zero.configure(
+ self.qq_zero_bits,
+ perchannel=True,
+ sym=self.qq_zero_sym,
+ round_zero=False,
+ **self.qqq_params
+ )
self.qq_zero.find_params(zero_groups, weight=True)
- assert self.qq_zero.scale.shape == (zero_groups.shape[0], 1), self.qq_zero.scale.shape
+ assert self.qq_zero.scale.shape == (
+ zero_groups.shape[0],
+ 1,
+ ), self.qq_zero.scale.shape
self.zero = self.qq_zero.quantize(zero_groups).reshape_as(self.zero)
if weight:
diff --git a/open_gpt/spqr/quantize.py b/run_gpt/spqr/quantize.py
similarity index 77%
rename from open_gpt/spqr/quantize.py
rename to run_gpt/spqr/quantize.py
index 3fcefd5..5157e89 100644
--- a/open_gpt/spqr/quantize.py
+++ b/run_gpt/spqr/quantize.py
@@ -1,10 +1,11 @@
# Adapted from https://github.com/Vahe1994/SpQR
import time
-from quantizeargs import QuantizeArgs
+
+import huggingface_hub
from datautils import *
+from quantizeargs import QuantizeArgs
from quantutils import *
-import huggingface_hub
def apply_quantize(args: QuantizeArgs, quantized_model_path: str = None):
@@ -17,7 +18,11 @@ def apply_quantize(args: QuantizeArgs, quantized_model_path: str = None):
else:
assert args.dataset != "custom"
dataloader, testloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model_path=args.model_path, seqlen=model.seqlen
+ args.dataset,
+ nsamples=args.nsamples,
+ seed=args.seed,
+ model_path=args.model_path,
+ seqlen=model.seqlen,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -37,7 +42,9 @@ def test(model_path):
datasets = ["wikitext2", "ptb", "c4"]
for dataset in datasets:
- dataloader, testloader = get_loaders(dataset, seed=0, model_path=model_path, seqlen=model.seqlen)
+ dataloader, testloader = get_loaders(
+ dataset, seed=0, model_path=model_path, seqlen=model.seqlen
+ )
print(dataset)
llama_eval(model, testloader, device)
@@ -46,6 +53,7 @@ def quant(model_name, quantized_model_path):
model_path = huggingface_hub.snapshot_download(model_name)
before_args = QuantizeArgs(model_name=model_name, model_path=model_path)
apply_quantize(before_args, quantized_model_path)
- quantized_args = QuantizeArgs(model_name=model_name,model_path=quantized_model_path)
+ quantized_args = QuantizeArgs(
+ model_name=model_name, model_path=quantized_model_path
+ )
return before_args, quantized_args
-
diff --git a/run_gpt/spqr/quantizeargs.py b/run_gpt/spqr/quantizeargs.py
new file mode 100644
index 0000000..d628210
--- /dev/null
+++ b/run_gpt/spqr/quantizeargs.py
@@ -0,0 +1,60 @@
+class QuantizeArgs:
+ def __init__(
+ self,
+ model_name,
+ model_path,
+ dataset: str = 'wikitext2',
+ load_from_saved: str = None,
+ seed: int = 0,
+ nsamples: int = 128,
+ percdamp: float = 0.01,
+ wbits: int = 4,
+ groupsize: int = 16,
+ permutation_order: str = "identity",
+ true_sequential: bool = False,
+ new_eval: bool = False,
+ sym: bool = False,
+ perchannel: bool = True,
+ qq_scale_bits: int = 3,
+ round_zero: int = None,
+ qq_zero_bits: int = 3,
+ qq_zero_sym: bool = False,
+ qq_groupsize: int = 16,
+ outlier_threshold: float = 0.2,
+ simplified_outliers: bool = False,
+ save: str = '',
+ save_safetensors: str = '',
+ benchmark: int = 0,
+ check: bool = False,
+ skip_out_loss: bool = False,
+ offload_activations: bool = False,
+ dtype: str = "auto",
+ ):
+ self.model_name = model_name
+ self.model_path = model_path
+ self.dataset = dataset
+ self.load_from_saved = load_from_saved
+ self.seed = seed
+ self.nsamples = nsamples
+ self.percdamp = percdamp
+ self.wbits = wbits
+ self.groupsize = groupsize
+ self.permutation_order = permutation_order
+ self.true_sequential = true_sequential
+ self.new_eval = new_eval
+ self.sym = sym
+ self.perchannel = perchannel
+ self.qq_scale_bits = qq_scale_bits
+ self.round_zero = round_zero
+ self.qq_zero_bits = qq_zero_bits
+ self.qq_zero_sym = qq_zero_sym
+ self.qq_groupsize = qq_groupsize
+ self.outlier_threshold = outlier_threshold
+ self.simplified_outliers = simplified_outliers
+ self.save = save
+ self.save_safetensors = save_safetensors
+ self.benchmark = benchmark
+ self.check = check
+ self.skip_out_loss = skip_out_loss
+ self.offload_activations = offload_activations
+ self.dtype = dtype
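
The new `quantizeargs.py` bundles every SpQR knob into one plain config object. Below is a hedged sketch of how it might be wired into the helpers from `run_gpt/spqr/quantize.py` above (`apply_quantize` plus `huggingface_hub.snapshot_download`); the checkpoint name and output path are placeholders, and the flat imports assume the scripts run from inside the `spqr` directory, matching how `quantizeargs` and `datautils` are already imported there.

```python
# Sketch only, not part of the patch: wiring QuantizeArgs into the helpers
# shown in run_gpt/spqr/quantize.py. The checkpoint and output path are placeholders.
import huggingface_hub
from quantize import apply_quantize
from quantizeargs import QuantizeArgs

model_name = 'decapoda-research/llama-7b-hf'  # hypothetical LLaMA checkpoint
model_path = huggingface_hub.snapshot_download(model_name)

args = QuantizeArgs(
    model_name=model_name,
    model_path=model_path,
    wbits=4,           # 4-bit base quantization (the QuantizeArgs default)
    groupsize=16,      # first-level group size
    qq_scale_bits=3,   # second-level quantization of the scales
    qq_zero_bits=3,    # second-level quantization of the zero points
)
apply_quantize(args, quantized_model_path='./llama-7b-spqr')
```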
diff --git a/open_gpt/spqr/quantutils.py b/run_gpt/spqr/quantutils.py
similarity index 82%
rename from open_gpt/spqr/quantutils.py
rename to run_gpt/spqr/quantutils.py
index 1c714c6..4113994 100644
--- a/open_gpt/spqr/quantutils.py
+++ b/run_gpt/spqr/quantutils.py
@@ -1,10 +1,11 @@
# Adapted from https://github.com/Vahe1994/SpQR
import time
+
import torch
import torch.nn as nn
+from spqr_engine import SPQRUtil
from tqdm import trange
from transformers import LlamaForCausalLM, LlamaTokenizer
-from spqr_engine import SPQRUtil
def save_llama(model_name, model, save_directory):
@@ -15,10 +16,15 @@ def save_llama(model_name, model, save_directory):
def get_llama(model_path):
import torch
+
def skip(*args, **kwargs):
pass
- saved_inits = torch.nn.init.kaiming_uniform_, torch.nn.init.uniform_, torch.nn.init.normal_ # preserving
+ saved_inits = (
+ torch.nn.init.kaiming_uniform_,
+ torch.nn.init.uniform_,
+ torch.nn.init.normal_,
+ ) # preserving
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
@@ -27,11 +33,15 @@ def skip(*args, **kwargs):
pretrained_model_name_or_path=model_path,
local_files_only=True,
low_cpu_mem_usage=True,
- torch_dtype="auto"
+ torch_dtype="auto",
)
model.seqlen = 2048
- torch.nn.init.kaiming_uniform_, torch.nn.init.uniform_, torch.nn.init.normal_ = saved_inits # restoring
+ (
+ torch.nn.init.kaiming_uniform_,
+ torch.nn.init.uniform_,
+ torch.nn.init.normal_,
+ ) = saved_inits # restoring
return model
@@ -40,20 +50,24 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=""):
return {name: module}
res = {}
for name1, child in module.named_children():
- res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1))
+ res.update(
+ find_layers(
+ child, layers=layers, name=name + "." + name1 if name != "" else name1
+ )
+ )
return res
def get_average_number_of_bits(
- wbits: int = 3,
- qq_scale_bits: int = 3,
- qq_zero_bits: int = 3,
- qqq_scale_bits: int = 16,
- qqq_zero_bits: int = 16,
- groupsize: int = 16,
- qq_groupsize: int = 16,
- round_zero: bool = False,
- global_ol_n_share: float = 0.00,
+ wbits: int = 3,
+ qq_scale_bits: int = 3,
+ qq_zero_bits: int = 3,
+ qqq_scale_bits: int = 16,
+ qqq_zero_bits: int = 16,
+ groupsize: int = 16,
+ qq_groupsize: int = 16,
+ round_zero: bool = False,
+ global_ol_n_share: float = 0.00,
):
# if not quantized stats are in full precision
qq_scale_bits = qq_scale_bits or 16
@@ -62,11 +76,17 @@ def get_average_number_of_bits(
qq_groupsize = qq_groupsize or float('inf')
if round_zero:
- wbits_avg = wbits + (qq_scale_bits + wbits) / groupsize + (qqq_scale_bits + qqq_zero_bits) / (
- groupsize * qq_groupsize)
+ wbits_avg = (
+ wbits
+ + (qq_scale_bits + wbits) / groupsize
+ + (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize)
+ )
else:
- wbits_avg = wbits + (qq_scale_bits + qq_zero_bits) / groupsize + 2 * (qqq_scale_bits + qqq_zero_bits) / (
- groupsize * qq_groupsize)
+ wbits_avg = (
+ wbits
+ + (qq_scale_bits + qq_zero_bits) / groupsize
+ + 2 * (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize)
+ )
# correct accounting for outliers
if global_ol_n_share > 0:
@@ -88,7 +108,9 @@ def llama_sequential(model, dataloader, args, dev):
layers[0] = layers[0].to(dev)
dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
+ inps = torch.zeros(
+ (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+ )
cache = {"i": 0, "attention_mask": None}
class Catcher(nn.Module):
@@ -128,7 +150,9 @@ def forward(self, inp, **kwargs):
normal_outlier_count_global, w_count_global = 0, 0
for i in range(len(layers)):
- print(f"\n------------------------------------------------------------------\nStarting layer {i}")
+ print(
+ f"\n------------------------------------------------------------------\nStarting layer {i}"
+ )
normal_outlier_count, w_count = 0, 0
stats_payload = {}
@@ -185,14 +209,19 @@ def tmp(_, inp, out):
simplified_outliers=args.simplified_outliers,
)
- gptq[name].layer.weight.data = quantized.weight.to(gptq[name].layer.weight.data.dtype)
+ gptq[name].layer.weight.data = quantized.weight.to(
+ gptq[name].layer.weight.data.dtype
+ )
quantizers["model.layers.%d.%s" % (i, name)] = () # to be updated
# OUTLIER STATS per module:
- normal_outliers_count = quantized.unstructured_outlier_mask.to(torch.int32).sum()
+ normal_outliers_count = quantized.unstructured_outlier_mask.to(
+ torch.int32
+ ).sum()
- stats_payload[f"n_{name}_ol_share"] = round((normal_outliers_count / quantized.weight.numel()).item(),
- 6)
+ stats_payload[f"n_{name}_ol_share"] = round(
+ (normal_outliers_count / quantized.weight.numel()).item(), 6
+ )
normal_outlier_count += normal_outliers_count.item()
w_count += quantized.weight.numel()
@@ -246,7 +275,7 @@ def tmp(_, inp, out):
args.groupsize,
args.qq_groupsize,
args.round_zero,
- normal_outlier_count_global / w_count_global
+ normal_outlier_count_global / w_count_global,
)
model.config.use_cache = use_cache
@@ -268,7 +297,9 @@ def llama_eval(model, testenc, dev):
layers[0] = layers[0].to(dev)
dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
+ inps = torch.zeros(
+ (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+ )
cache = {"i": 0, "attention_mask": None}
class Catcher(nn.Module):
@@ -286,7 +317,7 @@ def forward(self, inp, **kwargs):
saved_num_threads = torch.get_num_threads()
torch.set_num_threads(min(16, saved_num_threads))
for i in range(nsamples):
- batch = testenc[:, (i * model.seqlen): ((i + 1) * model.seqlen)].to(dev)
+ batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev)
try:
model(batch)
except ValueError:
@@ -324,9 +355,11 @@ def forward(self, inp, **kwargs):
hidden_states = model.model.norm(hidden_states)
lm_logits = model.lm_head(hidden_states)
shift_logits = lm_logits[:, :-1, :].contiguous()
- shift_labels = testenc[:, (i * model.seqlen): ((i + 1) * model.seqlen)][:, 1:]
+ shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
loss_fct = nn.CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+ loss = loss_fct(
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+ )
neg_log_likelihood = loss.float() * model.seqlen
nlls.append(neg_log_likelihood)
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
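
The reformatted `wbits_avg` expression in `get_average_number_of_bits` is easier to sanity-check once it is spread over several lines. A small worked example of the `round_zero=False` branch, using the default argument values visible in the hunk:

```python
# Worked example of the round_zero=False branch, using the defaults from the hunk.
wbits, qq_scale_bits, qq_zero_bits = 3, 3, 3
qqq_scale_bits, qqq_zero_bits = 16, 16          # second-level stats kept in 16 bits
groupsize, qq_groupsize = 16, 16

wbits_avg = (
    wbits
    + (qq_scale_bits + qq_zero_bits) / groupsize
    + 2 * (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize)
)
print(wbits_avg)  # 3 + 6/16 + 64/256 = 3.625
```

So with 3-bit weights, 3-bit first-level statistics in groups of 16, and 16-bit second-level statistics in 16x16 groups, the average cost works out to about 3.63 bits per weight before the outlier correction, which is a no-op while `global_ol_n_share` is 0.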
diff --git a/open_gpt/spqr/spqr_engine.py b/run_gpt/spqr/spqr_engine.py
similarity index 71%
rename from open_gpt/spqr/spqr_engine.py
rename to run_gpt/spqr/spqr_engine.py
index 5ec99b8..a310795 100644
--- a/open_gpt/spqr/spqr_engine.py
+++ b/run_gpt/spqr/spqr_engine.py
@@ -1,11 +1,13 @@
# Originally from https://github.com/Vahe1994/SpQR
from __future__ import annotations
+
import math
-from typing import Optional, NamedTuple, Union
+from typing import NamedTuple, Optional, Union
+
import torch
+from quant_groups import Quantizer, quantize
from tqdm.auto import tqdm
from weight_permutation import get_permutation_order
-from quant_groups import Quantizer, quantize
class SPQRUtil:
@@ -19,7 +21,9 @@ def __init__(self, layer):
self.nsamples = 0
def add_batch(self, inp):
- assert self.H is not None, "Already ran quantization; cannot add more data batches"
+ assert (
+ self.H is not None
+ ), "Already ran quantization; cannot add more data batches"
if len(inp.shape) == 2:
inp = inp.unsqueeze(0)
tmp = inp.shape[0]
@@ -82,7 +86,9 @@ def quantize(
self.H = None
H = H[perm][:, perm]
- self.dead = torch.diag(H) == 0 # indices of input features that do not affect outputs
+ self.dead = (
+ torch.diag(H) == 0
+ ) # indices of input features that do not affect outputs
if percdamp > 0:
ix = torch.arange(len(H), device=weight.device)
H[ix, ix] += percdamp * abs(torch.diag(H)).mean()
@@ -96,7 +102,9 @@ def quantize(
quantizer = Quantizer()
quantizer.configure(bits, perchannel=perchannel, sym=sym, **kwargs)
- assert H_inv_cho.shape[0] == H_inv_cho.shape[1] == weight.shape[1], "weight must be [out_features, in_features]"
+ assert (
+ H_inv_cho.shape[0] == H_inv_cho.shape[1] == weight.shape[1]
+ ), "weight must be [out_features, in_features]"
out_dim, in_dim = weight.shape # [out_features, in_features]
if groupsize is None:
@@ -106,15 +114,21 @@ def quantize(
outlier_column_indices = torch.empty(0, dtype=torch.int64, device=weight.device)
del H_inv
- outlier_scale = (weight.var(dim=0) / torch.diag(H_inv_cho).square()).mean().item()
+ outlier_scale = (
+ (weight.var(dim=0) / torch.diag(H_inv_cho).square()).mean().item()
+ )
unstructured_outlier_threshold = outlier_relative_threshold * outlier_scale
- in_group_index = -1 # index of current group of input features, for group quantizer purposes
+ in_group_index = (
+ -1
+ ) # index of current group of input features, for group quantizer purposes
quantization_errors = torch.zeros_like(weight)
unstructured_outlier_mask = torch.zeros_like(weight, dtype=torch.bool)
block_start_iter = range(0, in_dim - keep_last_columns, blocksize)
- block_start_iter = tqdm(block_start_iter, leave=False) if verbose else block_start_iter
+ block_start_iter = (
+ tqdm(block_start_iter, leave=False) if verbose else block_start_iter
+ )
for block_start in block_start_iter:
block_end = min(block_start + blocksize, in_dim)
for column_index in range(block_start, block_end):
@@ -123,57 +137,90 @@ def quantize(
in_group_index += 1
group_weight = weight[:, column_index : column_index + groupsize]
- if simplified_outliers or (unstructured_outlier_threshold == float("inf")):
+ if simplified_outliers or (
+ unstructured_outlier_threshold == float("inf")
+ ):
quantizer.find_params(group_weight, weight=True)
else:
# objective: detect which weights will be designated as outliers, fit quantizer *without* these weights
# step 1: fit quantizer on a leave-one-out version of weights, i.e. in each group, drop one weight at a time
- assert perchannel, "refitting quantizer is only implemented for perchannel=True"
- group_diag_hessian_inv_cho = H_inv_cho_diag[column_index : column_index + groupsize]
+ assert (
+ perchannel
+ ), "refitting quantizer is only implemented for perchannel=True"
+ group_diag_hessian_inv_cho = H_inv_cho_diag[
+ column_index : column_index + groupsize
+ ]
loo_quantization_error_sq = get_leave_one_out_error(
group_weight, group_diag_hessian_inv_cho, bits=bits, sym=sym
)
# ^-- dequantized(quantized(group_weight)) using a quantizer trained on all weights except the reconstructed one
- likely_unstructured_outlier_mask = (loo_quantization_error_sq > unstructured_outlier_threshold).float()
+ likely_unstructured_outlier_mask = (
+ loo_quantization_error_sq > unstructured_outlier_threshold
+ ).float()
non_outlier_mask = 1 - likely_unstructured_outlier_mask
- mean_over_non_outliers = torch.sum(group_weight * non_outlier_mask, dim=1, keepdim=True) / torch.sum(
- non_outlier_mask, dim=1, keepdim=True
- ).clamp_min(1)
- group_weight_without_outliers = group_weight * non_outlier_mask + mean_over_non_outliers * (
- 1 - non_outlier_mask
+ mean_over_non_outliers = torch.sum(
+ group_weight * non_outlier_mask, dim=1, keepdim=True
+ ) / torch.sum(non_outlier_mask, dim=1, keepdim=True).clamp_min(
+ 1
+ )
+ group_weight_without_outliers = (
+ group_weight * non_outlier_mask
+ + mean_over_non_outliers * (1 - non_outlier_mask)
+ )
+ quantizer.find_params(
+ group_weight_without_outliers, weight=True
)
- quantizer.find_params(group_weight_without_outliers, weight=True)
del group_diag_hessian_inv_cho, loo_quantization_error_sq
- del mean_over_non_outliers, group_weight_without_outliers, non_outlier_mask
+ del (
+ mean_over_non_outliers,
+ group_weight_without_outliers,
+ non_outlier_mask,
+ )
del group_weight
weight_i_quantized = quantize(
- weight[:, column_index].unsqueeze(1), quantizer.scale, quantizer.zero, quantizer.maxq
+ weight[:, column_index].unsqueeze(1),
+ quantizer.scale,
+ quantizer.zero,
+ quantizer.maxq,
).reshape_as(weight[:, column_index])
- delta_weight_i = weight[:, column_index] - weight_i_quantized # [out_dim]
- quantization_errors[:, column_index] = delta_weight_i / H_inv_cho[column_index, column_index] # [out_dim]
+ delta_weight_i = (
+ weight[:, column_index] - weight_i_quantized
+ ) # [out_dim]
+ quantization_errors[:, column_index] = (
+ delta_weight_i / H_inv_cho[column_index, column_index]
+ ) # [out_dim]
if unstructured_outlier_threshold != float("inf"):
unstructured_outlier_mask[:, column_index] = (
- quantization_errors[:, column_index].square() > unstructured_outlier_threshold
+ quantization_errors[:, column_index].square()
+ > unstructured_outlier_threshold
)
# re-quantize without outliers
is_outlier = unstructured_outlier_mask[:, column_index].float()
weight_i_quantized_wo_outliers = quantize(
- (weight[:, column_index] * (1 - is_outlier)).unsqueeze(1), quantizer.scale, quantizer.zero, quantizer.maxq
+ (weight[:, column_index] * (1 - is_outlier)).unsqueeze(1),
+ quantizer.scale,
+ quantizer.zero,
+ quantizer.maxq,
).reshape_as(weight[:, column_index])
weight_i_quantized = (
- weight_i_quantized_wo_outliers * (1 - is_outlier) + weight[:, column_index] * is_outlier
+ weight_i_quantized_wo_outliers * (1 - is_outlier)
+ + weight[:, column_index] * is_outlier
) # [out_dim]
del weight_i_quantized_wo_outliers
- delta_weight_i = weight[:, column_index] - weight_i_quantized # [out_dim]
- quantization_errors[:, column_index] = delta_weight_i / H_inv_cho[column_index, column_index] # [out_dim]
+ delta_weight_i = (
+ weight[:, column_index] - weight_i_quantized
+ ) # [out_dim]
+ quantization_errors[:, column_index] = (
+ delta_weight_i / H_inv_cho[column_index, column_index]
+ ) # [out_dim]
weight[:, column_index] = weight_i_quantized
weight[:, column_index + 1 : block_end].addr_(
@@ -205,7 +252,9 @@ class QuantizationResult(NamedTuple):
"""A collection of codebooks, indices and assorted statistics produced by SPQRUtil; not memory-optimized!"""
weight: torch.FloatTensor # dequantized(quantized(weight)), same shape as the original
- perm: Optional[torch.LongTensor] # optional input permutation indices that were used during quantization
+ perm: Optional[
+ torch.LongTensor
+ ] # optional input permutation indices that were used during quantization
# NOTE: if permutation_order != identity, all subsequent tensors (incl. outlier indices) are permuted in that order!
quantization_errors: torch.Tensor # per-element quantization errors, defined as (weight - quantized_weight) / diag(inverse_hessian_cholesky)
@@ -213,27 +262,46 @@ class QuantizationResult(NamedTuple):
unstructured_outlier_mask: torch.Tensor # bool mask where True means that this is an individual outlier
-def get_leave_one_out_error(group_weight: torch.Tensor, group_diag_hessian_inv_cho: torch.Tensor, *, bits, sym):
+def get_leave_one_out_error(
+ group_weight: torch.Tensor, group_diag_hessian_inv_cho: torch.Tensor, *, bits, sym
+):
"""EXPERIMENTAL! BEWARE - for each weight, fit quantizer without this_one_weight and return this one weight's reconstruction"""
assert group_weight.ndim == 2
loo_indices = torch.arange(group_weight.shape[1], device=group_weight.device)
- loo_indices = loo_indices[1:] - (loo_indices[:, None] >= loo_indices[1:]).to(loo_indices.dtype)
- groupwise_loo_data = group_weight[:, loo_indices] # [num_groups, num_loo = groupsize, groupsize - 1]
+ loo_indices = loo_indices[1:] - (loo_indices[:, None] >= loo_indices[1:]).to(
+ loo_indices.dtype
+ )
+ groupwise_loo_data = group_weight[
+ :, loo_indices
+ ] # [num_groups, num_loo = groupsize, groupsize - 1]
fast_quantizer = Quantizer(shape=groupwise_loo_data.flatten(0, 1).shape)
fast_quantizer.configure(bits, perchannel=True, sym=sym)
fast_quantizer.find_params(groupwise_loo_data.flatten(0, 1), weight=True)
# compute error improvement from not quantizing each one weight
# to do so, we shall first train quantizer on leave-one-out data (which can be done faster since not all data affects quantization)
- loo_groupwise_reconstructed_weights = fast_quantizer.quantize(groupwise_loo_data.flatten(0, 1)).reshape_as(groupwise_loo_data)
- loo_group_diag_hessian_inv_cho = group_diag_hessian_inv_cho[loo_indices] # [num_loo = groupsize, groupsize - 1]
+ loo_groupwise_reconstructed_weights = fast_quantizer.quantize(
+ groupwise_loo_data.flatten(0, 1)
+ ).reshape_as(groupwise_loo_data)
+ loo_group_diag_hessian_inv_cho = group_diag_hessian_inv_cho[
+ loo_indices
+ ] # [num_loo = groupsize, groupsize - 1]
assert group_diag_hessian_inv_cho.ndim == 1
# total quantization error consists of hessian-weighted mse on all remaining weights except for the one that's left out
# -- this is because the left-out weights will not be quantized, and therefore, has zero quantization error
- loo_errors_sq = ((loo_groupwise_reconstructed_weights - groupwise_loo_data) / loo_group_diag_hessian_inv_cho).square().sum(-1)
- assert loo_errors_sq.shape == group_weight.shape # [num_groups, num_loo = groupsize]
+ loo_errors_sq = (
+ (
+ (loo_groupwise_reconstructed_weights - groupwise_loo_data)
+ / loo_group_diag_hessian_inv_cho
+ )
+ .square()
+ .sum(-1)
+ )
+ assert (
+ loo_errors_sq.shape == group_weight.shape
+ ) # [num_groups, num_loo = groupsize]
# as a baseline error, quantize data normally without outliers
base_quantizer = Quantizer(shape=group_weight.shape)
@@ -241,7 +309,9 @@ def get_leave_one_out_error(group_weight: torch.Tensor, group_diag_hessian_inv_c
base_quantizer.find_params(group_weight, weight=True)
baseline_reconstructed_weights = base_quantizer.quantize(group_weight)
baseline_errors_sq = (
- ((baseline_reconstructed_weights - group_weight) / group_diag_hessian_inv_cho).square().sum(dim=1, keepdim=True)
+ ((baseline_reconstructed_weights - group_weight) / group_diag_hessian_inv_cho)
+ .square()
+ .sum(dim=1, keepdim=True)
)
# outlier's usefulness = how much does mse decrease from treating this weight as an outlier
diff --git a/open_gpt/spqr/test.py b/run_gpt/spqr/test.py
similarity index 100%
rename from open_gpt/spqr/test.py
rename to run_gpt/spqr/test.py
diff --git a/open_gpt/spqr/weight_permutation.py b/run_gpt/spqr/weight_permutation.py
similarity index 93%
rename from open_gpt/spqr/weight_permutation.py
rename to run_gpt/spqr/weight_permutation.py
index 583c6b2..7fc0961 100644
--- a/open_gpt/spqr/weight_permutation.py
+++ b/run_gpt/spqr/weight_permutation.py
@@ -25,7 +25,12 @@ def find_greedy_nearest_indices(weight: torch.Tensor, use_abs: bool = False):
return permutation
-def get_permutation_order(H: torch.Tensor, W: torch.Tensor, permutation_order: str = "identity", use_abs: bool = False):
+def get_permutation_order(
+ H: torch.Tensor,
+ W: torch.Tensor,
+ permutation_order: str = "identity",
+ use_abs: bool = False,
+):
"""
Permutation order for layer weights.
:param H: Hessian of Weights
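
For reference, the reformatted `get_permutation_order` signature can be exercised on its own. The sketch below uses stand-in tensors (the shapes are assumptions made for this sketch, not taken from the patch) and the documented `"identity"` order, which should simply leave the column order untouched.

```python
# Stand-in tensors; shapes are assumptions for this sketch, not taken from the patch.
import torch
from weight_permutation import get_permutation_order

H = torch.eye(8)        # placeholder Hessian of the layer inputs
W = torch.randn(4, 8)   # placeholder weight matrix [out_features, in_features]

perm = get_permutation_order(H, W, permutation_order="identity")
print(perm)  # with "identity", the original column order is expected back
```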
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 8df172f..0beaafe 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -1,5 +1,5 @@
-import open_gpt
-from open_gpt.profile import LLMMeasure, end_measure, log_measures, start_measure
+import run_gpt
+from run_gpt.profile import LLMMeasure, end_measure, log_measures, start_measure
PROMPT = 'The goal of life is'
@@ -21,11 +21,11 @@ def main(args):
print(f"===> start model loading ...")
model_load_start = start_measure()
if args.precision == 'fp16':
- model = open_gpt.create_model(
+ model = run_gpt.create_model(
args.model_name, precision='fp16', device_map=args.device_map
)
else:
- model = open_gpt.create_model(
+ model = run_gpt.create_model(
args.model_name,
precision=args.precision,
adapter_name_or_path=args.adapter_name,
@@ -54,7 +54,7 @@ def main(args):
if __name__ == '__main__':
import argparse
- parser = argparse.ArgumentParser(description='Benchmark for open_gpt.')
+ parser = argparse.ArgumentParser(description='Benchmark for run_gpt.')
parser.add_argument(
'--model-name',
type=str,
diff --git a/setup.py b/setup.py
index 27b9551..6c5721e 100644
--- a/setup.py
+++ b/setup.py
@@ -4,11 +4,11 @@
from setuptools import find_packages, setup
if sys.version_info < (3, 8, 0):
- raise OSError(f'OpenGPT requires Python >=3.8, but yours is {sys.version}')
+ raise OSError(f'RunGPT requires Python >=3.8, but yours is {sys.version}')
try:
- pkg_name = 'open-gpt-torch'
- libinfo_py = path.join(path.dirname(__file__), 'open_gpt', '__init__.py')
+ pkg_name = 'run-gpt-torch'
+ libinfo_py = path.join(path.dirname(__file__), 'run_gpt', '__init__.py')
libinfo_content = open(libinfo_py, 'r', encoding='utf8').readlines()
version_line = [l.strip() for l in libinfo_content if l.startswith('__version__')][
0
@@ -33,8 +33,8 @@
author='Jina AI',
author_email='hello@jina.ai',
license='Apache 2.0',
- url='https://https://github.com/jina-ai/opengpt',
- download_url='https://https://github.com/jina-ai/opengpt/tags',
+ url='https://github.com/jina-ai/rungpt',
+ download_url='https://github.com/jina-ai/rungpt/tags',
long_description=_long_description,
long_description_content_type='text/markdown',
zip_safe=False,
@@ -62,7 +62,7 @@
},
entry_points={
'console_scripts': [
- 'opengpt = open_gpt.cli.application:main',
+ 'rungpt = run_gpt.cli.application:main',
],
},
classifiers=[
@@ -84,9 +84,9 @@
"Topic :: Software Development :: Libraries :: Python Modules",
],
project_urls={
- 'Documentation': 'https://opengpt.jina.ai',
- 'Source': 'https://github.com/jina-ai/opengpt/',
- 'Tracker': 'https://github.com/jina-ai/opengpt/issues',
+ 'Documentation': 'https://rungpt.jina.ai',
+ 'Source': 'https://github.com/jina-ai/rungpt/',
+ 'Tracker': 'https://github.com/jina-ai/rungpt/issues',
},
keywords=[
"jina",