diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 7e85f63..8d2b34c 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -34,7 +34,7 @@ jobs: echo "BUILD_DATE=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_ENV echo "BUILD_TARGET=clip_executor" >> $GITHUB_ENV - VERSION=$(sed -n '/^__version__ = /p' ./open_gpt/__init__.py | cut -d \' -f2) + VERSION=$(sed -n '/^__version__ = /p' ./rungpt/__init__.py | cut -d \' -f2) V_VERSION=v${VERSION} MINOR_VERSION=${VERSION%.*} MAJOR_VERSION=${MINOR_VERSION%.*} @@ -42,21 +42,21 @@ jobs: if [[ "${{ github.event.inputs.triggered_by }}" == "CD" ]]; then # on every CD release echo "TAG_ALIAS=\ - jinaai/open_gpt:master" \ + jinaai/rungpt:master" \ >> $GITHUB_ENV elif [[ "${{ github.event.inputs.triggered_by }}" == "TAG" ]]; then # on every tag release echo "TAG_ALIAS=\ - jinaai/open_gpt:latest, \ - jinaai/open_gpt:v${VERSION}, \ - jinaai/open_gpt:v${MINOR_VERSION} \ + jinaai/rungpt:latest, \ + jinaai/rungpt:v${VERSION}, \ + jinaai/rungpt:v${MINOR_VERSION} \ " >> $GITHUB_ENV elif [[ "${{ github.event.inputs.triggered_by }}" == "MANUAL" ]]; then # on every manual release echo "TAG_ALIAS=\ - jinaai/open_gpt:v${VERSION} \ + jinaai/rungpt:v${VERSION} \ " >> $GITHUB_ENV else echo "Bad triggered_by: ${{ github.event.inputs.triggered_by }}!" @@ -86,7 +86,7 @@ jobs: with: file: Dockerfiles/Dockerfile platforms: linux/amd64 - cache-from: type=registry,ref=jinaai/open_gpt:latest + cache-from: type=registry,ref=jinaai/rungpt:latest cache-to: type=inline push: true tags: ${{env.TAG_ALIAS}} @@ -101,10 +101,10 @@ jobs: with: file: Dockerfiles/gateway.Dockerfile platforms: linux/amd64 - cache-from: type=registry,ref=jinaai/open_gpt_gateway:latest + cache-from: type=registry,ref=jinaai/run_gpt_gateway:latest cache-to: type=inline push: true - tags: jinaai/open_gpt_gateway:v${{env.VERSION}}, jinaai/open_gpt_gateway:latest + tags: jinaai/run_gpt_gateway:v${{env.VERSION}}, jinaai/run_gpt_gateway:latest build-args: | BUILD_DATE=${{env.BUILD_DATE}} VERSION=${{env.VERSION}} @@ -116,10 +116,10 @@ jobs: with: file: Dockerfiles/executor.Dockerfile platforms: linux/amd64 - cache-from: type=registry,ref=jinaai/open_gpt_executor:latest + cache-from: type=registry,ref=jinaai/run_gpt_executor:latest cache-to: type=inline push: true - tags: jinaai/open_gpt_executor:v${{env.VERSION}}, jinaai/open_gpt_executor:latest + tags: jinaai/run_gpt_executor:v${{env.VERSION}}, jinaai/run_gpt_executor:latest build-args: | BUILD_DATE=${{env.BUILD_DATE}} VERSION=${{env.VERSION}} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 7169896..9cbbb2a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -9,12 +9,12 @@ jobs: steps: - uses: actions/checkout@v3 with: - path: opengpt + path: rungpt - uses: actions/checkout@v3 with: - repository: numb3r3/opengpt.github.io.git + repository: numb3r3/rungpt.github.io.git ref: 'main' - path: ./opengpt.github.io + path: ./rungpt.github.io token: ${{ secrets.GH_TEST_TOKEN }} - uses: actions/setup-python@v2 with: @@ -23,5 +23,5 @@ jobs: pip install pillow cairosvg sudo apt-get install -y libcairo2-dev libfreetype6-dev libffi-dev libjpeg-dev libpng-dev libz-dev pip install mkdocs-material mkdocs-material-extensions mkdocs-redirects --upgrade - mkdocs gh-deploy --config-file ../opengpt/mkdocs.yml --force - working-directory: ./opengpt.github.io \ No newline at end of file + mkdocs gh-deploy --config-file ../rungpt/mkdocs.yml --force + 
working-directory: ./rungpt.github.io \ No newline at end of file diff --git a/.github/workflows/force-release.yml b/.github/workflows/force-release.yml index 02a7965..e7c1434 100644 --- a/.github/workflows/force-release.yml +++ b/.github/workflows/force-release.yml @@ -42,7 +42,7 @@ jobs: echo "VCS_REF=$VCS_REF" >> $GITHUB_ENV echo "Will build $VCS_REF" - VERSION=$(sed -n '/^__version__ = /p' ./open_gpt/__init__.py | cut -d \' -f2) + VERSION=$(sed -n '/^__version__ = /p' ./run_gpt/__init__.py | cut -d \' -f2) echo "VERSION=$VERSION" >> $GITHUB_ENV diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 77395c0..3ecb8c8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -28,7 +28,7 @@ jobs: echo "VCS_REF=$VCS_REF" >> $GITHUB_ENV echo "Will build $VCS_REF" - VERSION=$(sed -n '/^__version__ = /p' ./open_gpt/__init__.py | cut -d \' -f2) + VERSION=$(sed -n '/^__version__ = /p' ./run_gpt/__init__.py | cut -d \' -f2) echo "VERSION=$VERSION" >> $GITHUB_ENV diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e81ec59..e69a066 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 4.0.1 hooks: - id: flake8 - exclude: ^(.git|__pycache__|docs/source/conf.py|old|build|dist|tests|open_gpt/resources/) + exclude: ^(.git|__pycache__|docs/source/conf.py|old|build|dist|tests|run_gpt/resources/) args: - --max-complexity=10 - --max-line-length=127 @@ -12,8 +12,8 @@ repos: # rev: v1.5.8 # hooks: # - id: darglint -# files: open_gpt/ -# exclude: ^(docs/|open_gpt/resources/) +# files: run_gpt/ +# exclude: ^(docs/|run_gpt/resources/) # args: # - --message-template={path}:{line} {msg_id} {msg} # - -s=sphinx @@ -24,7 +24,7 @@ repos: hooks: - id: pydocstyle files: client/ - exclude: ^(docs/|open_gpt/resources/) + exclude: ^(docs/|run_gpt/resources/) args: - --select=D101,D102,D103 - repo: https://github.com/ambv/black @@ -32,7 +32,7 @@ repos: hooks: - id: black types: [python] - exclude: ^(docs/|open_gpt/resources/) + exclude: ^(docs/|run_gpt/resources/) args: - -S - repo: https://github.com/asottile/blacken-docs diff --git a/Dockerfiles/Dockerfile b/Dockerfiles/Dockerfile index 9a2aa2a..9c53c0f 100644 --- a/Dockerfiles/Dockerfile +++ b/Dockerfiles/Dockerfile @@ -8,9 +8,9 @@ FROM mosaicml/pytorch:${TORCH_VERSION}_cu${CUDA_VERSION}-python3.10-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 # copy will almost always invalid the cache -COPY . /open_gpt/ -WORKDIR /open_gpt +COPY . /run_gpt/ +WORKDIR /run_gpt RUN python3 -m pip install -e . -ENTRYPOINT ["opengpt"] \ No newline at end of file +ENTRYPOINT ["rungpt"] \ No newline at end of file diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md index e6e9f64..92bb3f1 100644 --- a/MODEL_ZOO.md +++ b/MODEL_ZOO.md @@ -1,6 +1,6 @@ -# Model Zoo in OpenGPT +# Model Zoo in RunGPT -OpenGPT supports the following models out of the box: +RunGPT supports the following models out of the box: - LLM (Large Language Model) diff --git a/README.md b/README.md index 2cd75e4..515cdea 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# ☄️ OpenGPT +# ☄️ RunGPT

-OpenGPT: An open-source cloud-native large-scale multimodal model serving framework
+RunGPT: An open-source cloud-native large-scale multimodal model serving framework

@@ -11,10 +11,10 @@ ![](https://img.shields.io/badge/Made%20with-JinaAI-blueviolet?style=flat) -[![PyPI](https://img.shields.io/pypi/v/open_gpt_torch)](https://pypi.org/project/open_gpt_torch/) -[![PyPI - License](https://img.shields.io/pypi/l/open_gpt_torch)](https://pypi.org/project/open_gpt_torch/) +[![PyPI](https://img.shields.io/pypi/v/run_gpt_torch)](https://pypi.org/project/run_gpt_torch/) +[![PyPI - License](https://img.shields.io/pypi/l/run_gpt_torch)](https://pypi.org/project/run_gpt_torch/) -**OpenGPT** is an open-source _cloud-native_ large-scale **_multimodal models_** (LMMs) serving framework. +**RunGPT** is an open-source _cloud-native_ large-scale **_multimodal models_** (LMMs) serving framework. It is designed to simplify the deployment and management of large language models, on a distributed cluster of GPUs. We aim to make it a one-stop solution for a centralized and accessible place to gather techniques for optimizing large-scale multimodal models and make them easy to use for everyone. @@ -30,7 +30,7 @@ We aim to make it a one-stop solution for a centralized and accessible place to ## Features -OpenGPT provides the following features to make it easy to deploy and serve **large multi-modal models** (LMMs) at scale: +RunGPT provides the following features to make it easy to deploy and serve **large multi-modal models** (LMMs) at scale: - Support for multi-modal models on top of large language models - Scalable architecture for handling high traffic loads @@ -41,13 +41,13 @@ OpenGPT provides the following features to make it easy to deploy and serve **la ## Updates -- **2023-05-12**: 🎉We have released the first version `v0.0.1` of OpenGPT. You can install it with `pip install open_gpt_torch`. +- **2023-05-12**: 🎉We have released the first version `v0.0.1` of RunGPT. You can install it with `pip install run_gpt_torch`. ## Supported Models
-OpenGPT supports the following models out of the box: +RunGPT supports the following models out of the box: - LLM (Large Language Model) @@ -69,7 +69,7 @@ For more details about the supported models, please see the [Model Zoo](./MODEL_ ## Roadmap -You can view our roadmap with features that are planned, started, and completed on the [Roadmap discussion](https://github.com/jina-ai/opengpt/discussions/categories/roadmap) category. +You can view our roadmap with features that are planned, started, and completed on the [Roadmap discussion](https://github.com/jina-ai/rungpt/discussions/categories/roadmap) category. ## Get Started @@ -78,15 +78,15 @@ You can view our roadmap with features that are planned, started, and completed Install the package with `pip`: ```bash -pip install open_gpt_torch +pip install run_gpt_torch ``` ### Quickstart ```python -import open_gpt +import run_gpt -model = open_gpt.create_model( +model = run_gpt.create_model( 'stabilityai/stablelm-tuned-alpha-3b', device='cuda', precision='fp16' ) @@ -117,7 +117,7 @@ We use the [stabilityai/stablelm-tuned-alpha-3b](https://huggingface.co/stabilit In most cases of large model serving, the model cannot fit into a single GPU. To solve this problem, we also provide a `device_map` option (supported by `accecleate` package) to automatically partition the model and distribute it across multiple GPUs: ```python -model = open_gpt.create_model( +model = run_gpt.create_model( 'stabilityai/stablelm-tuned-alpha-3b', precision='fp16', device_map='balanced' ) ``` @@ -128,7 +128,7 @@ In the above example, `device_map="balanced"` evenly split the model on all avai > The `device_map` option is supported by the [accelerate](https://github.com/huggingface/accelerate) package. -See [examples on how to use opengpt with different models.](./examples) 🔥 +See [examples on how to use rungpt with different models.](./examples) 🔥 ## Build a model serving in one line @@ -136,16 +136,16 @@ See [examples on how to use opengpt with different models.](./examples) 🔥 To do so, you can use the `serve` command: ```bash -opengpt serve stabilityai/stablelm-tuned-alpha-3b --precision fp16 --device_map balanced +rungpt serve stabilityai/stablelm-tuned-alpha-3b --precision fp16 --device_map balanced ``` -💡 **Tip**: you can inspect the available options with `opengpt serve --help`. +💡 **Tip**: you can inspect the available options with `rungpt serve --help`. This will start a gRPC and HTTP server listening on port `51000` and `52000` respectively. Once the server is ready, as shown below:
Click to expand - +
You can then send requests to the server: @@ -173,7 +173,7 @@ response = requests.post( What's more, we also provide a [Python client](https://github.com/jina-ai/inference-client/) (`inference-client`) for you to easily interact with the server: ```python -from open_gpt import Client +from run_gpt import Client client = Client() @@ -206,7 +206,7 @@ To do so, you can use `deploy` command: using predefined executor ```bash -opengpt deploy stabilityai/stablelm-tuned-alpha-3b --precision fp16 --device_map balanced --cloud jina --replicas 1 +rungpt deploy stabilityai/stablelm-tuned-alpha-3b --precision fp16 --device_map balanced --cloud jina --replicas 1 ``` It will give you a HTTP url and a gRPC url by default: @@ -226,4 +226,4 @@ We welcome contributions from the community! To contribute, please submit a pull ## License -OpenGPT is licensed under the Apache License, Version 2.0. See LICENSE for the full license text. \ No newline at end of file +Rungpt is licensed under the Apache License, Version 2.0. See LICENSE for the full license text. \ No newline at end of file diff --git a/docs/docs/index.md b/docs/docs/index.md index 017005a..63de02d 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -1,18 +1,18 @@ # Quick start -`opengpt` is an open-source _cloud-native_ large-scale **_multimodal models_** (LMMs) serving framework. +`rungpt` is an open-source _cloud-native_ large-scale **_multimodal models_** (LMMs) serving framework. It is designed to simplify the deployment and management of large language models, on a distributed cluster of GPUs. We aim to make it a one-stop solution for a centralized and accessible place to gather techniques for optimizing large-scale multimodal models and make them easy to use for everyone. ## Installation and setup -To use `opengpt`, install it with `pip`: +To use `rungpt`, install it with `pip`:
 ```shell
-$ pip install open_gpt_torch
+$ pip install run_gpt_torch
 ```
@@ -25,9 +25,9 @@ We use the [stabilityai/stablelm-tuned-alpha-3b](https://huggingface.co/stabilit
 ```python
-import open_gpt
+import run_gpt
 
-model = open_gpt.create_model(
+model = run_gpt.create_model(
     'stabilityai/stablelm-tuned-alpha-3b', device='cuda', precision='fp16'
 )
diff --git a/docs/overrides/home.html b/docs/overrides/home.html
index 3467c23..a190a9b 100644
--- a/docs/overrides/home.html
+++ b/docs/overrides/home.html
@@ -23,9 +23,9 @@

Run LLM faster and easier. Use any cloud.

-

opengpt allows you to run your large-scale multi-modal models on any cloud, with a single command.

+

rungpt allows you to run your large-scale multi-modal models on any cloud, with a single command.

-

opengpt is open-source, self-hosted, and supports all major cloud providers, +

rungpt is open-source, self-hosted, and supports all major cloud providers, including AWS, GCP, and Azure.

@@ -158,13 +158,13 @@

Open-source and self-hosted

- Getting started with opengpt's open source tool is + Getting started with rungpt's open source tool is just a mater of:

$ - pip install "open_gpt_torch" + pip install "run_gpt_torch"

diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 64b22a1..43720f0 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -1,9 +1,9 @@ {% extends "base.html" %} {% block announce %} -Like opengpt? Give us a ⭐ on + src="{{ 'assets/images/github-logo.png' | url }}"/> GitHub! {% endblock %} diff --git a/examples/chat_example.py b/examples/chat_example.py index cb8b335..b919074 100644 --- a/examples/chat_example.py +++ b/examples/chat_example.py @@ -1,17 +1,21 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure is_step = False start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'nthngdy/pythia-owt2-70m-100k', precision='fp16', device_map='balanced' ) -message = [{"role": "user", "content": "What is your name?"}, - {"role": "assistant", "content": "My name is Pythia."}, - {"role": "user", "content": 'Hello Pythia, can you tell me that is the goal of life?'} - ] +message = [ + {"role": "user", "content": "What is your name?"}, + {"role": "assistant", "content": "My name is Pythia."}, + { + "role": "user", + "content": 'Hello Pythia, can you tell me that is the goal of life?', + }, +] if not is_step: generated_text = model.chat( @@ -26,4 +30,4 @@ _['past_key_values'] = None print(_) end_measures = end_measure(start_measures) -log_measures(end_measures, "Model generation") \ No newline at end of file +log_measures(end_measures, "Model generation") diff --git a/examples/codegen_example.py b/examples/codegen_example.py index ec55a12..3f1d885 100644 --- a/examples/codegen_example.py +++ b/examples/codegen_example.py @@ -1,8 +1,8 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'Salesforce/codegen-350M-mono', precision='fp16', device_map='balanced' ) diff --git a/examples/context_example.py b/examples/context_example.py index 952ff46..2c1b747 100644 --- a/examples/context_example.py +++ b/examples/context_example.py @@ -1,9 +1,9 @@ -import open_gpt -from open_gpt.models.session import SessionManager +import run_gpt +from run_gpt.models.session import SessionManager session_manager = SessionManager() -model = open_gpt.create_model( +model = run_gpt.create_model( 'decapoda-research/llama-7b-hf', precision='fp16', device_map='balanced' ) diff --git a/examples/flamingo_example.py b/examples/flamingo_example.py index 83bcf12..ec07608 100644 --- a/examples/flamingo_example.py +++ b/examples/flamingo_example.py @@ -4,8 +4,8 @@ import torch from PIL import Image -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure demo_image_one = Image.open( requests.get( @@ -26,7 +26,7 @@ ) start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'openflamingo/OpenFlamingo-9B', precision='fp16', device='cuda', device_map=None ) diff --git a/examples/llama_example.py b/examples/llama_example.py index 1c6e7f2..76b53e4 100644 --- a/examples/llama_example.py +++ b/examples/llama_example.py @@ -1,10 +1,10 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import 
end_measure, log_measures, start_measure is_step = True # start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'decapoda-research/llama-7b-hf', precision='fp16', device_map='balanced' ) diff --git a/examples/lora_example.py b/examples/lora_example.py index 7e91bc1..482cc31 100644 --- a/examples/lora_example.py +++ b/examples/lora_example.py @@ -1,10 +1,10 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure start_measures = start_measure() # llama + LoRA -model = open_gpt.create_model( +model = run_gpt.create_model( 'yahma/llama-7b-hf', precision='fp16', adapter_name_or_path='jinaai/llama-code', diff --git a/examples/moss_example.py b/examples/moss_example.py index cd24905..638fb19 100644 --- a/examples/moss_example.py +++ b/examples/moss_example.py @@ -1,14 +1,14 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( model_name='fnlp/moss-moon-003-sft', precision='fp16', device_map='balanced', ) -# model = open_gpt.create_model( +# model = run_gpt.create_model( # model_name='fnlp/moss-moon-003-sft', # precision='bit8', # device_map='balanced', diff --git a/examples/pythia_example.py b/examples/pythia_example.py index 39526cb..7d06560 100644 --- a/examples/pythia_example.py +++ b/examples/pythia_example.py @@ -1,12 +1,12 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'EleutherAI/pythia-12b-deduped', precision='fp16', device_map='balanced' ) -# model = open_gpt.create_model( +# model = run_gpt.create_model( # 'EleutherAI/pythia-12b-deduped', precision='bit4', device_map='balanced' # ) diff --git a/examples/rwkv_example.py b/examples/rwkv_example.py index bffd65d..e039c2a 100644 --- a/examples/rwkv_example.py +++ b/examples/rwkv_example.py @@ -1,10 +1,10 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure prompt = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese." start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'ybelkada/rwkv-raven-1b5', precision='fp16', device_map='balanced' diff --git a/examples/stablelm_example.py b/examples/stablelm_example.py index 6f82ac3..38b5fbc 100644 --- a/examples/stablelm_example.py +++ b/examples/stablelm_example.py @@ -1,5 +1,5 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version) - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI. 
@@ -11,12 +11,12 @@ prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>" start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( model_name='stabilityai/stablelm-tuned-alpha-7b', precision='fp16', device_map='balanced', ) -# model = open_gpt.create_model( +# model = run_gpt.create_model( # model_name='stabilityai/stablelm-tuned-alpha-7b', # precision='bit4', # device_map='balanced', diff --git a/examples/vicuna_example.py b/examples/vicuna_example.py index a883370..8b53621 100644 --- a/examples/vicuna_example.py +++ b/examples/vicuna_example.py @@ -1,11 +1,11 @@ -import open_gpt -from open_gpt.profile import end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import end_measure, log_measures, start_measure start_measures = start_measure() -model = open_gpt.create_model( +model = run_gpt.create_model( 'lmsys/vicuna-7b-delta-v1.1', precision='fp16', device_map='balanced' ) -# model = open_gpt.create_model( +# model = run_gpt.create_model( # 'CarperAI/stable-vicuna-13b-delta', precision='fp16', device_map='balanced' # ) diff --git a/mkdocs.yml b/mkdocs.yml index d5d339b..0313291 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,13 +1,13 @@ # Project information -site_name: OpenGPT -# site_url: https://opengpt.dev.jina.ai +site_name: RunGPT +# site_url: https://rungpt.dev.jina.ai site_author: jina-ai GmbH site_description: >- Easily and cost-effectively LLM deployments on any cloud. # Repository -repo_url: https://github.com/jina-ai/opengpt -repo_name: jina-ai/opengpt +repo_url: https://github.com/jina-ai/rungpt +repo_name: jina-ai/rungpt edit_uri: edit/master/docs/ #Copyright @@ -111,11 +111,11 @@ extra: # property: G-XXXXXX social: - icon: /fontawesome/brands/github - link: https://github.com/jina-ai/opengpt + link: https://github.com/jina-ai/rungpt - icon: /fontawesome/brands/python - link: https://pypi.org/project/open-gpt-torch + link: https://pypi.org/project/run-gpt-torch # - icon: /fontawesome/brands/docker -# link: https://hub.docker.com/r/jina-ai/opengpt +# link: https://hub.docker.com/r/jina-ai/rungpt - icon: /fontawesome/brands/discord link: https://discord.gg/yaNVaaGSXZ - icon: /fontawesome/brands/twitter @@ -135,8 +135,8 @@ nav: - Kubernetes: docs/deployment/clouds.md - Reference: - CLI: - - opengpt serve: docs/reference/cli/start.md - - opengpt deploy: docs/reference/cli/start.md - - Examples: https://github.com/jina-ai/opengpt/blob/main/examples + - rungpt serve: docs/reference/cli/start.md + - rungpt deploy: docs/reference/cli/start.md + - Examples: https://github.com/jina-ai/rungpt/blob/main/examples - Discord: https://discord.gg/yaNVaaGSXZ - Twitter: https://twitter.com/JinaAI_ diff --git a/open_gpt/cli/commands/about.py b/open_gpt/cli/commands/about.py deleted file mode 100644 index 4eb226b..0000000 --- a/open_gpt/cli/commands/about.py +++ /dev/null @@ -1,24 +0,0 @@ -from cleo.commands.command import Command - - -class AboutCommand(Command): - name = "about" - - description = "Shows information about OpenGPT." - - def handle(self) -> int: - from open_gpt import __version__ - - self.line( - f"""\ -OpenGPT - An open-source cloud-native model serving framework. - -Version: {__version__} - -OpenGPT is a open-source cloud-native model serving framework\ - and libraries. 
-See https://github.com/jina-ai/opengpt for more information.\ -""" - ) - - return 0 diff --git a/open_gpt/models/chat.py b/open_gpt/models/chat.py deleted file mode 100644 index 83ea877..0000000 --- a/open_gpt/models/chat.py +++ /dev/null @@ -1,80 +0,0 @@ -import torch - -from typing import Optional, List - -MAX_LENGTH = 2048 - - -class ChatMixin: - """Mixin for chat methods.""" - - model: 'AutoModelForCausalLM' - tokenizer: 'AutoTokenizer' - - @torch.inference_mode() - def chat(self, messages: List[dict], - max_new_tokens: Optional[int] = None, - num_beams: int = 1, - do_sample: bool = False, - temperature: float = 1.0, - top_k: int = 1, - top_p: float = 0.9, - repetition_penalty: float = 1.0, - length_penalty: float = 1.0, - no_repeat_ngram_size: int = 0, - echo: bool = False, - **kwargs): - """Generate text from the given prompt. - - :param messages: A list of messages comprising the conversation so far. - :param max_new_tokens: The maximum number of tokens to generate, not including the prompt. - :param num_beams: Number of beams for beam search. 1 means no beam search. - :param do_sample: Whether to use sampling instead of greedy decoding. - :param temperature: The temperature to use for sampling. Only relevant if do_sample is True. Higher means more stochastic. - :param top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Only relevant if do_sample is True. - :param top_p: The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Only relevant if do_sample is True. - :param repetition_penalty: The parameter for repetition penalty. 1.0 means no penalty. - :param length_penalty: Exponential penalty to the length that is used with beam-based generation. - It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. - Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, - while length_penalty < 0.0 encourages shorter sequences. - :param no_repeat_ngram_size: If set to int > 0, all ngrams of that size can only occur once. - :param echo: Whether to echo the prompt in the generated text. 
- """ - - # normalize input - prompt = self.create_prompt_for_chat(messages) - completion_response = self.generate(prompt=prompt, max_new_tokens=max_new_tokens, num_beams=num_beams, - do_sample=do_sample, temperature=temperature, - top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, - length_penalty=length_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, echo=echo, **kwargs) - # normalize output - choices = completion_response.pop('choices') - return {'choices': [{'index': 0, 'message': {'role': 'assistant', - 'content': choices[0]['text']}, - 'finish_reason': choices[0]['finish_reason']} - ], - **completion_response} - - @torch.inference_mode() - def step_chat(self, messages: Optional[List[dict]] = None, input_ids: Optional[List[int]] = None, **kwargs): - if messages is None and input_ids is None: - raise ValueError("Either messages or input_ids must be provided.") - if messages and input_ids: - raise ValueError("Only one of messages or input_ids can be provided.") - - if messages: - # normalize input - prompt = self.create_prompt_for_chat(messages) - completion_response = self.step_generate(prompt=prompt, **kwargs) - else: - completion_response = self.step_generate(input_ids=input_ids, **kwargs) - # normalize output - for response in completion_response: - choices = response.pop('choices') - yield {'choices': [{'index': 0, 'message': {'role': 'assistant', - 'content': choices[0]['text']}, - 'finish_reason': choices[0]['finish_reason']} - ], - **response} diff --git a/open_gpt/spqr/quantizeargs.py b/open_gpt/spqr/quantizeargs.py deleted file mode 100644 index 37281d8..0000000 --- a/open_gpt/spqr/quantizeargs.py +++ /dev/null @@ -1,58 +0,0 @@ -class QuantizeArgs(): - def __init__(self, - model_name, - model_path, - dataset: str = 'wikitext2', - load_from_saved: str = None, - seed: int = 0, - nsamples: int = 128, - percdamp: float = 0.01, - wbits: int = 4, - groupsize: int = 16, - permutation_order: str = "identity", - true_sequential: bool = False, - new_eval: bool = False, - sym: bool = False, - perchannel: bool = True, - qq_scale_bits: int = 3, - round_zero: int = None, - qq_zero_bits: int = 3, - qq_zero_sym: bool = False, - qq_groupsize: int = 16, - outlier_threshold: float = 0.2, - simplified_outliers: bool = False, - save: str = '', - save_safetensors: str = '', - benchmark: int = 0, - check: bool = False, - skip_out_loss: bool = False, - offload_activations: bool = False, - dtype: str = "auto"): - self.model_name= model_name - self.model_path = model_path - self.dataset = dataset - self.load_from_saved = load_from_saved - self.seed = seed - self.nsamples = nsamples - self.percdamp = percdamp - self.wbits = wbits - self.groupsize = groupsize - self.permutation_order = permutation_order - self.true_sequential = true_sequential - self.new_eval = new_eval - self.sym = sym - self.perchannel = perchannel - self.qq_scale_bits = qq_scale_bits - self.round_zero = round_zero - self.qq_zero_bits = qq_zero_bits - self.qq_zero_sym = qq_zero_sym - self.qq_groupsize = qq_groupsize - self.outlier_threshold = outlier_threshold - self.simplified_outliers = simplified_outliers - self.save = save - self.save_safetensors = save_safetensors - self.benchmark = benchmark - self.check = check - self.skip_out_loss = skip_out_loss - self.offload_activations = offload_activations - self.dtype = dtype \ No newline at end of file diff --git a/open_gpt/__init__.py b/run_gpt/__init__.py similarity index 100% rename from open_gpt/__init__.py rename to run_gpt/__init__.py diff --git 
a/open_gpt/__main__.py b/run_gpt/__main__.py
similarity index 57%
rename from open_gpt/__main__.py
rename to run_gpt/__main__.py
index 07fb413..5379a97 100644
--- a/open_gpt/__main__.py
+++ b/run_gpt/__main__.py
@@ -1,6 +1,6 @@
 import sys
 
 if __name__ == "__main__":
-    from open_gpt.cli.application import main
+    from run_gpt.cli.application import main
 
     sys.exit(main())
diff --git a/open_gpt/cli/__init__.py b/run_gpt/cli/__init__.py
similarity index 100%
rename from open_gpt/cli/__init__.py
rename to run_gpt/cli/__init__.py
diff --git a/open_gpt/cli/application.py b/run_gpt/cli/application.py
similarity index 80%
rename from open_gpt/cli/application.py
rename to run_gpt/cli/application.py
index 75513a3..4061b79 100644
--- a/open_gpt/cli/application.py
+++ b/run_gpt/cli/application.py
@@ -4,14 +4,14 @@
 from cleo.application import Application as BaseApplication
 from cleo.commands.command import Command
 
-from open_gpt import __version__
-from open_gpt.cli.command_loader import CommandLoader
+from run_gpt import __version__
+from run_gpt.cli.command_loader import CommandLoader
 
 
 def load_command(name: str) -> Callable[[], Command]:
     def _load() -> Command:
         words = name.split(" ")
-        module = import_module("open_gpt.cli.commands." + ".".join(words))
+        module = import_module("run_gpt.cli.commands." + ".".join(words))
         command_class = getattr(module, "".join(c.title() for c in words) + "Command")
         command: Command = command_class()
         return command
@@ -30,7 +30,7 @@ def _load() -> Command:
 
 class Application(BaseApplication):
     def __init__(self) -> None:
-        super().__init__("opengpt", __version__)
+        super().__init__("rungpt", __version__)
         command_loader = CommandLoader({name: load_command(name) for name in COMMANDS})
         self.set_command_loader(command_loader)
 
diff --git a/open_gpt/cli/command_loader.py b/run_gpt/cli/command_loader.py
similarity index 100%
rename from open_gpt/cli/command_loader.py
rename to run_gpt/cli/command_loader.py
diff --git a/open_gpt/cli/commands/__init__.py b/run_gpt/cli/commands/__init__.py
similarity index 100%
rename from open_gpt/cli/commands/__init__.py
rename to run_gpt/cli/commands/__init__.py
diff --git a/run_gpt/cli/commands/about.py b/run_gpt/cli/commands/about.py
new file mode 100644
index 0000000..2d7081e
--- /dev/null
+++ b/run_gpt/cli/commands/about.py
@@ -0,0 +1,24 @@
+from cleo.commands.command import Command
+
+
+class AboutCommand(Command):
+    name = "about"
+
+    description = "Shows information about RunGPT."
+
+    def handle(self) -> int:
+        from run_gpt import __version__
+
+        self.line(
+            f"""\
+RunGPT - An open-source cloud-native model serving framework.
+
+Version: {__version__}
+
+RunGPT is an open-source cloud-native model serving framework\
+ and libraries.
+See https://github.com/jina-ai/RunGPT for more information.\ +""" + ) + + return 0 diff --git a/open_gpt/cli/commands/deploy.py b/run_gpt/cli/commands/deploy.py similarity index 92% rename from open_gpt/cli/commands/deploy.py rename to run_gpt/cli/commands/deploy.py index 56c583f..7c0e823 100644 --- a/open_gpt/cli/commands/deploy.py +++ b/run_gpt/cli/commands/deploy.py @@ -54,13 +54,13 @@ class DeployCommand(Command): To start a model deploying, you can run: - opengpt deploy stabilityai/stablelm-tuned-alpha-3b""" + rungpt deploy stabilityai/stablelm-tuned-alpha-3b""" def handle(self) -> int: if self.option('cloud') == 'jina': - from open_gpt.factory import create_flow - from open_gpt.helper import asyncify - from open_gpt.serve.flow import deploy + from run_gpt.factory import create_flow + from run_gpt.helper import asyncify + from run_gpt.serve.flow import deploy if self.option('config') is None: flow_yaml = create_flow( diff --git a/open_gpt/cli/commands/playground.py b/run_gpt/cli/commands/playground.py similarity index 85% rename from open_gpt/cli/commands/playground.py rename to run_gpt/cli/commands/playground.py index dab6fa6..5a07708 100644 --- a/open_gpt/cli/commands/playground.py +++ b/run_gpt/cli/commands/playground.py @@ -19,10 +19,10 @@ class PlaygroundCommand(Command): To start a online playground for a model, you can run: - opengpt playground facebook/llama-7b""" + rungpt playground facebook/llama-7b""" def handle(self) -> int: - from open_gpt.serve.playground.gradio import create_playground + from run_gpt.serve.playground.gradio import create_playground playground = create_playground() playground.queue( diff --git a/open_gpt/cli/commands/quantize.py b/run_gpt/cli/commands/quantize.py similarity index 70% rename from open_gpt/cli/commands/quantize.py rename to run_gpt/cli/commands/quantize.py index 11cf1d9..4256c76 100644 --- a/open_gpt/cli/commands/quantize.py +++ b/run_gpt/cli/commands/quantize.py @@ -11,15 +11,12 @@ class QuantizeCommand(Command): options = [ option( - 'target', - None, - 'The path to quantized checkpoint.', - flag=False, - default='' + 'target', None, 'The path to quantized checkpoint.', flag=False, default='' ) ] def handle(self) -> int: - from open_gpt.spqr import quantize + from run_gpt.spqr import quantize + _, _ = quantize(self.argument('model_name'), self.option('target')) return 0 diff --git a/open_gpt/cli/commands/serve.py b/run_gpt/cli/commands/serve.py similarity index 94% rename from open_gpt/cli/commands/serve.py rename to run_gpt/cli/commands/serve.py index edb8ea3..3a847cd 100644 --- a/open_gpt/cli/commands/serve.py +++ b/run_gpt/cli/commands/serve.py @@ -51,10 +51,10 @@ class ServeCommand(Command): To start a model serving locally, you can run: - opengpt serve stabilityai/stablelm-tuned-alpha-3b""" + rungpt serve stabilityai/stablelm-tuned-alpha-3b""" def handle(self) -> int: - from open_gpt.factory import create_flow + from run_gpt.factory import create_flow with create_flow( model_name_or_path=self.argument('model_name'), diff --git a/open_gpt/factory.py b/run_gpt/factory.py similarity index 95% rename from open_gpt/factory.py rename to run_gpt/factory.py index 33ed021..71ee8ec 100644 --- a/open_gpt/factory.py +++ b/run_gpt/factory.py @@ -157,8 +157,8 @@ def create_flow( ): from jina import Flow - from open_gpt import __jina_version__, __version__ - from open_gpt.serve.flow import get_template + from run_gpt import __jina_version__, __version__ + from run_gpt.serve.flow import get_template # normalize the model name to be used as flow 
executor name norm_name = model_name_or_path.split('/')[-1] @@ -182,14 +182,14 @@ def create_flow( 'gateway_params': {'cors': cors}, 'jina_version': __jina_version__, 'replicas': replicas, - 'labels': {'app': 'open_gpt', 'version': __VERSION_TAG__}, + 'labels': {'app': 'run_gpt', 'version': __VERSION_TAG__}, } yaml = get_template('flow.yml.jinja2').render( dockerized=dockerized, - gateway_image=f'docker://jinaai/open_gpt_gateway:{__VERSION_TAG__}', + gateway_image=f'docker://jinaai/run_gpt_gateway:{__VERSION_TAG__}', gateway_module='Gateway', - executor_image=f'docker://jinaai/open_gpt_executor:{__VERSION_TAG__}', + executor_image=f'docker://jinaai/run_gpt_executor:{__VERSION_TAG__}', executor_module='CausualLMExecutor' if 'flamingo' not in model_name_or_path else 'FlamingoExecutor', diff --git a/open_gpt/helper.py b/run_gpt/helper.py similarity index 100% rename from open_gpt/helper.py rename to run_gpt/helper.py diff --git a/open_gpt/logs.py b/run_gpt/logs.py similarity index 100% rename from open_gpt/logs.py rename to run_gpt/logs.py diff --git a/open_gpt/models/__init__.py b/run_gpt/models/__init__.py similarity index 100% rename from open_gpt/models/__init__.py rename to run_gpt/models/__init__.py diff --git a/run_gpt/models/chat.py b/run_gpt/models/chat.py new file mode 100644 index 0000000..746d67b --- /dev/null +++ b/run_gpt/models/chat.py @@ -0,0 +1,107 @@ +from typing import List, Optional + +import torch + +MAX_LENGTH = 2048 + + +class ChatMixin: + """Mixin for chat methods.""" + + model: 'AutoModelForCausalLM' + tokenizer: 'AutoTokenizer' + + @torch.inference_mode() + def chat( + self, + messages: List[dict], + max_new_tokens: Optional[int] = None, + num_beams: int = 1, + do_sample: bool = False, + temperature: float = 1.0, + top_k: int = 1, + top_p: float = 0.9, + repetition_penalty: float = 1.0, + length_penalty: float = 1.0, + no_repeat_ngram_size: int = 0, + echo: bool = False, + **kwargs + ): + """Generate text from the given prompt. + + :param messages: A list of messages comprising the conversation so far. + :param max_new_tokens: The maximum number of tokens to generate, not including the prompt. + :param num_beams: Number of beams for beam search. 1 means no beam search. + :param do_sample: Whether to use sampling instead of greedy decoding. + :param temperature: The temperature to use for sampling. Only relevant if do_sample is True. Higher means more stochastic. + :param top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Only relevant if do_sample is True. + :param top_p: The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Only relevant if do_sample is True. + :param repetition_penalty: The parameter for repetition penalty. 1.0 means no penalty. + :param length_penalty: Exponential penalty to the length that is used with beam-based generation. + It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. + Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, + while length_penalty < 0.0 encourages shorter sequences. + :param no_repeat_ngram_size: If set to int > 0, all ngrams of that size can only occur once. + :param echo: Whether to echo the prompt in the generated text. 
+ """ + + # normalize input + prompt = self.create_prompt_for_chat(messages) + completion_response = self.generate( + prompt=prompt, + max_new_tokens=max_new_tokens, + num_beams=num_beams, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + echo=echo, + **kwargs + ) + # normalize output + choices = completion_response.pop('choices') + return { + 'choices': [ + { + 'index': 0, + 'message': {'role': 'assistant', 'content': choices[0]['text']}, + 'finish_reason': choices[0]['finish_reason'], + } + ], + **completion_response, + } + + @torch.inference_mode() + def step_chat( + self, + messages: Optional[List[dict]] = None, + input_ids: Optional[List[int]] = None, + **kwargs + ): + if messages is None and input_ids is None: + raise ValueError("Either messages or input_ids must be provided.") + if messages and input_ids: + raise ValueError("Only one of messages or input_ids can be provided.") + + if messages: + # normalize input + prompt = self.create_prompt_for_chat(messages) + completion_response = self.step_generate(prompt=prompt, **kwargs) + else: + completion_response = self.step_generate(input_ids=input_ids, **kwargs) + # normalize output + for response in completion_response: + choices = response.pop('choices') + yield { + 'choices': [ + { + 'index': 0, + 'message': {'role': 'assistant', 'content': choices[0]['text']}, + 'finish_reason': choices[0]['finish_reason'], + } + ], + **response, + } diff --git a/open_gpt/models/embedding.py b/run_gpt/models/embedding.py similarity index 100% rename from open_gpt/models/embedding.py rename to run_gpt/models/embedding.py diff --git a/open_gpt/models/flamingo/__init__.py b/run_gpt/models/flamingo/__init__.py similarity index 100% rename from open_gpt/models/flamingo/__init__.py rename to run_gpt/models/flamingo/__init__.py diff --git a/open_gpt/models/flamingo/config.json b/run_gpt/models/flamingo/config.json similarity index 100% rename from open_gpt/models/flamingo/config.json rename to run_gpt/models/flamingo/config.json diff --git a/open_gpt/models/flamingo/configuration_flamingo.py b/run_gpt/models/flamingo/configuration_flamingo.py similarity index 100% rename from open_gpt/models/flamingo/configuration_flamingo.py rename to run_gpt/models/flamingo/configuration_flamingo.py diff --git a/open_gpt/models/flamingo/flamingo_lm.py b/run_gpt/models/flamingo/flamingo_lm.py similarity index 100% rename from open_gpt/models/flamingo/flamingo_lm.py rename to run_gpt/models/flamingo/flamingo_lm.py diff --git a/open_gpt/models/flamingo/flamingo_model.py b/run_gpt/models/flamingo/flamingo_model.py similarity index 99% rename from open_gpt/models/flamingo/flamingo_model.py rename to run_gpt/models/flamingo/flamingo_model.py index 92e9abb..fa8cebe 100644 --- a/open_gpt/models/flamingo/flamingo_model.py +++ b/run_gpt/models/flamingo/flamingo_model.py @@ -5,7 +5,7 @@ from open_flamingo.src.helpers import PerceiverResampler from torch import nn -from open_gpt.logs import logger +from run_gpt.logs import logger from ...helper import auto_dtype_and_device diff --git a/open_gpt/models/flamingo/loading.py b/run_gpt/models/flamingo/loading.py similarity index 99% rename from open_gpt/models/flamingo/loading.py rename to run_gpt/models/flamingo/loading.py index 982ace1..88c8dd0 100644 --- a/open_gpt/models/flamingo/loading.py +++ b/run_gpt/models/flamingo/loading.py @@ -4,7 +4,7 @@ import torch from 
open_flamingo.src.utils import extend_instance -from open_gpt.logs import logger +from run_gpt.logs import logger def load_model_and_transforms( diff --git a/open_gpt/models/flamingo/modeling.py b/run_gpt/models/flamingo/modeling.py similarity index 98% rename from open_gpt/models/flamingo/modeling.py rename to run_gpt/models/flamingo/modeling.py index 7b312dc..d47e270 100644 --- a/open_gpt/models/flamingo/modeling.py +++ b/run_gpt/models/flamingo/modeling.py @@ -3,7 +3,7 @@ import torch from PIL import Image -from open_gpt.models.modeling import BaseModel +from run_gpt.models.modeling import BaseModel class FlamingoModel(BaseModel): diff --git a/open_gpt/models/generation.py b/run_gpt/models/generation.py similarity index 100% rename from open_gpt/models/generation.py rename to run_gpt/models/generation.py diff --git a/open_gpt/models/llama/__init__.py b/run_gpt/models/llama/__init__.py similarity index 100% rename from open_gpt/models/llama/__init__.py rename to run_gpt/models/llama/__init__.py diff --git a/open_gpt/models/llama/loading.py b/run_gpt/models/llama/loading.py similarity index 95% rename from open_gpt/models/llama/loading.py rename to run_gpt/models/llama/loading.py index 6adf371..1ae5069 100644 --- a/open_gpt/models/llama/loading.py +++ b/run_gpt/models/llama/loading.py @@ -2,8 +2,8 @@ import torch -from open_gpt.helper import get_device_map -from open_gpt.logs import logger +from run_gpt.helper import get_device_map +from run_gpt.logs import logger def load_model_and_tokenizer( @@ -51,7 +51,7 @@ def load_model_and_tokenizer( from packaging import version from transformers import BitsAndBytesConfig - from open_gpt import importlib_metadata + from run_gpt import importlib_metadata trf_version = importlib_metadata.version("transformers") if 'dev' in trf_version: diff --git a/open_gpt/models/llama/modeling.py b/run_gpt/models/llama/modeling.py similarity index 94% rename from open_gpt/models/llama/modeling.py rename to run_gpt/models/llama/modeling.py index 679d967..b5e1ed3 100644 --- a/open_gpt/models/llama/modeling.py +++ b/run_gpt/models/llama/modeling.py @@ -2,7 +2,7 @@ import torch -from open_gpt.models.modeling import BaseModel +from run_gpt.models.modeling import BaseModel class LlamaModel(BaseModel): diff --git a/open_gpt/models/loading.py b/run_gpt/models/loading.py similarity index 95% rename from open_gpt/models/loading.py rename to run_gpt/models/loading.py index 9731de1..847275d 100644 --- a/open_gpt/models/loading.py +++ b/run_gpt/models/loading.py @@ -2,8 +2,8 @@ import torch -from open_gpt.helper import _DEFAULT_FP16_DTYPE, get_device_map -from open_gpt.logs import logger +from run_gpt.helper import _DEFAULT_FP16_DTYPE, get_device_map +from run_gpt.logs import logger def load_model_and_tokenizer( @@ -50,7 +50,7 @@ def load_model_and_tokenizer( elif precision == 'bit4': from packaging import version - from open_gpt import importlib_metadata + from run_gpt import importlib_metadata trf_version = importlib_metadata.version("transformers") if 'dev' in trf_version: diff --git a/open_gpt/models/modeling.py b/run_gpt/models/modeling.py similarity index 100% rename from open_gpt/models/modeling.py rename to run_gpt/models/modeling.py diff --git a/open_gpt/models/moss/__init__.py b/run_gpt/models/moss/__init__.py similarity index 100% rename from open_gpt/models/moss/__init__.py rename to run_gpt/models/moss/__init__.py diff --git a/open_gpt/models/moss/modeling.py b/run_gpt/models/moss/modeling.py similarity index 93% rename from open_gpt/models/moss/modeling.py rename 
to run_gpt/models/moss/modeling.py index 0396e24..cb2f7ff 100644 --- a/open_gpt/models/moss/modeling.py +++ b/run_gpt/models/moss/modeling.py @@ -1,7 +1,7 @@ from typing import List -from open_gpt.models.modeling import BaseModel -from open_gpt.logs import logger +from run_gpt.logs import logger +from run_gpt.models.modeling import BaseModel class MossModel(BaseModel): @@ -16,9 +16,9 @@ class MossModel(BaseModel): The quick usage is summarized in the snippet below: ```python - import open_gpt + import run_gpt - model = open_gpt.create_model( + model = run_gpt.create_model( 'fnlp/moss-moon-003-sft', precision='fp16', device_map='balanced' ) @@ -75,7 +75,9 @@ def create_prompt_for_chat(self, messages: List[dict]) -> str: content = message['content'] if role == 'system': - logger.warning('System message detected, but MOSS has a specific system instruction, will skip ...') + logger.warning( + 'System message detected, but MOSS has a specific system instruction, will skip ...' + ) elif role == 'user': string_messages += f'<|Human|>: {content}\n' elif role == 'assistant': diff --git a/open_gpt/models/pythia/__init__.py b/run_gpt/models/pythia/__init__.py similarity index 100% rename from open_gpt/models/pythia/__init__.py rename to run_gpt/models/pythia/__init__.py diff --git a/open_gpt/models/pythia/modeling.py b/run_gpt/models/pythia/modeling.py similarity index 74% rename from open_gpt/models/pythia/modeling.py rename to run_gpt/models/pythia/modeling.py index bfe138d..ada5e9e 100644 --- a/open_gpt/models/pythia/modeling.py +++ b/run_gpt/models/pythia/modeling.py @@ -1,4 +1,4 @@ -from open_gpt.models.modeling import BaseModel +from run_gpt.models.modeling import BaseModel class PythiaModel(BaseModel): @@ -9,12 +9,12 @@ class PythiaModel(BaseModel): See https://github.com/EleutherAI/pythia for more information. 
- The quick way to use Pythia via :meth:`open_gpt.create_model`: + The quick way to use Pythia via :meth:`run_gpt.create_model`: ```python - import open_gpt + import run_gpt - model = open_gpt.create_model( + model = run_gpt.create_model( 'EleutherAI/pythia-12b-deduped', precision='fp16', device_map='balanced' ) ``` diff --git a/open_gpt/models/rwkv/__init__.py b/run_gpt/models/rwkv/__init__.py similarity index 100% rename from open_gpt/models/rwkv/__init__.py rename to run_gpt/models/rwkv/__init__.py diff --git a/open_gpt/models/rwkv/modeling.py b/run_gpt/models/rwkv/modeling.py similarity index 96% rename from open_gpt/models/rwkv/modeling.py rename to run_gpt/models/rwkv/modeling.py index a573c2f..639873c 100644 --- a/open_gpt/models/rwkv/modeling.py +++ b/run_gpt/models/rwkv/modeling.py @@ -3,7 +3,7 @@ import torch import torch.nn.functional as F -from open_gpt.models.modeling import BaseModel +from run_gpt.models.modeling import BaseModel class RWKVModel(BaseModel): diff --git a/open_gpt/models/session.py b/run_gpt/models/session.py similarity index 100% rename from open_gpt/models/session.py rename to run_gpt/models/session.py diff --git a/open_gpt/models/stablelm/__init__.py b/run_gpt/models/stablelm/__init__.py similarity index 100% rename from open_gpt/models/stablelm/__init__.py rename to run_gpt/models/stablelm/__init__.py diff --git a/open_gpt/models/stablelm/modeling.py b/run_gpt/models/stablelm/modeling.py similarity index 87% rename from open_gpt/models/stablelm/modeling.py rename to run_gpt/models/stablelm/modeling.py index 4b47362..1159a82 100644 --- a/open_gpt/models/stablelm/modeling.py +++ b/run_gpt/models/stablelm/modeling.py @@ -10,8 +10,8 @@ import torch from transformers import StoppingCriteria, StoppingCriteriaList -from open_gpt.models.modeling import BaseModel -from open_gpt.logs import logger +from run_gpt.logs import logger +from run_gpt.models.modeling import BaseModel class StopOnTokens(StoppingCriteria): @@ -30,12 +30,12 @@ class StableLMModel(BaseModel): See https://github.com/Stability-AI/StableLM for more details. - The quick way to use StableLM via :meth:`open_gpt.create_model`: + The quick way to use StableLM via :meth:`run_gpt.create_model`: ```python - import open_gpt + import run_gpt - model = open_gpt.create_model('stabilityai/stablelm-tuned-alpha-7b') + model = run_gpt.create_model('stabilityai/stablelm-tuned-alpha-7b') system_prompt = ( '<|SYSTEM|># StableLM Tuned (Alpha version)\n' @@ -63,7 +63,7 @@ class StableLMModel(BaseModel): ) # Generate text with StableLM-StableVicuna-13B - model = open_gpt.create_model('CarperAI/stable-vicuna-13b-delta') + model = run_gpt.create_model('CarperAI/stable-vicuna-13b-delta') ``` """ @@ -89,7 +89,7 @@ def generate(self, prompts: Union[str, List[str]], **kwargs): if not self.is_vicuna_model else None, skip_special_tokens=False, - **kwargs + **kwargs, ) def create_prompt_for_chat(self, messages: List[dict]) -> str: @@ -99,7 +99,9 @@ def create_prompt_for_chat(self, messages: List[dict]) -> str: content = message['content'] if role == 'system': - logger.warning('System message detected, but StableLM has a specific system instruction, will skip ...') + logger.warning( + 'System message detected, but StableLM has a specific system instruction, will skip ...' 
+ ) elif role == 'user': string_messages += f'<|USER|>{content}' elif role == 'assistant': diff --git a/open_gpt/models/vicuna/__init__.py b/run_gpt/models/vicuna/__init__.py similarity index 100% rename from open_gpt/models/vicuna/__init__.py rename to run_gpt/models/vicuna/__init__.py diff --git a/open_gpt/models/vicuna/loading.py b/run_gpt/models/vicuna/loading.py similarity index 98% rename from open_gpt/models/vicuna/loading.py rename to run_gpt/models/vicuna/loading.py index 2491dae..34e9503 100644 --- a/open_gpt/models/vicuna/loading.py +++ b/run_gpt/models/vicuna/loading.py @@ -2,7 +2,7 @@ import torch -from open_gpt.logs import logger +from run_gpt.logs import logger def load_model_and_tokenizer( @@ -26,7 +26,7 @@ def load_model_and_tokenizer( if precision in ['bit4', 'bit8']: from packaging import version - from open_gpt import importlib_metadata + from run_gpt import importlib_metadata trf_version = importlib_metadata.version("transformers") if 'dev' in trf_version: diff --git a/open_gpt/models/vicuna/modeling.py b/run_gpt/models/vicuna/modeling.py similarity index 86% rename from open_gpt/models/vicuna/modeling.py rename to run_gpt/models/vicuna/modeling.py index d7cdd61..845f490 100644 --- a/open_gpt/models/vicuna/modeling.py +++ b/run_gpt/models/vicuna/modeling.py @@ -2,7 +2,7 @@ import torch -from open_gpt.models.llama.modeling import LlamaModel +from run_gpt.models.llama.modeling import LlamaModel class VicunaModel(LlamaModel): @@ -11,12 +11,12 @@ class VicunaModel(LlamaModel): Vicuna is trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. See https://vicuna.lmsys.org/ for more details. - The quick way to use Vicuna via :meth:`open_gpt.create_model`: + The quick way to use Vicuna via :meth:`run_gpt.create_model`: ```python - import open_gpt + import run_gpt - model = open_gpt.create_model('lmsys/vicuna-7b-delta-v1.1') + model = run_gpt.create_model('lmsys/vicuna-7b-delta-v1.1') # Generate text text_out = model.generate_text(prompts='Hello, my name is', max_length=50) @@ -25,9 +25,9 @@ class VicunaModel(LlamaModel): If you want to run inference with lower precision and/or on a specific device, you can do: ```python - import open_gpt + import run_gpt - model = open_gpt.create_model( + model = run_gpt.create_model( 'lmsys/vicuna-7b-delta-v1.1', precision='fp16', device_map='balanced' ) ``` diff --git a/open_gpt/profile.py b/run_gpt/profile.py similarity index 93% rename from open_gpt/profile.py rename to run_gpt/profile.py index 2dedf84..8f3887c 100644 --- a/open_gpt/profile.py +++ b/run_gpt/profile.py @@ -117,13 +117,11 @@ def end_measure(start_measures): # GPU mem for i in range(torch.cuda.device_count()): measures[str(i)] = ( - torch.cuda.memory_allocated(i) - start_measures[ - str(i)] - ) / GB + torch.cuda.memory_allocated(i) - start_measures[str(i)] + ) / GB measures[f"{i}-peak"] = ( - torch.cuda.max_memory_allocated(i) - - start_measures[str(i)] - ) / GB + torch.cuda.max_memory_allocated(i) - start_measures[str(i)] + ) / GB return measures @@ -167,8 +165,12 @@ def end_record(self, generation_outputs: Union[str, List[str]]): ) else: num_tokens = sum( - list(map(lambda x: len(self._tokenizer(x)['input_ids']) - 2, - generation_outputs)) + list( + map( + lambda x: len(self._tokenizer(x)['input_ids']) - 2, + generation_outputs, + ) + ) ) self._generation_length.append(num_tokens) self._time_stamp = None diff --git a/open_gpt/resources/flow.yml.jinja2 b/run_gpt/resources/flow.yml.jinja2 similarity index 94% rename from 
open_gpt/resources/flow.yml.jinja2 rename to run_gpt/resources/flow.yml.jinja2 index b0561d0..aca4ace 100644 --- a/open_gpt/resources/flow.yml.jinja2 +++ b/run_gpt/resources/flow.yml.jinja2 @@ -16,7 +16,7 @@ gateway: uses: jtype: {{ gateway_module }} py_modules: - - open_gpt.serve.gateway + - run_gpt.serve.gateway {% else -%} uses: {{ gateway_image }} {% endif -%} @@ -36,7 +36,7 @@ executors: uses: jtype: {{ executor_module }} py_modules: - - open_gpt.serve.executors + - run_gpt.serve.executors {% else -%} uses: {{ executor_image }} {% endif -%} diff --git a/open_gpt/serve/__init__.py b/run_gpt/serve/__init__.py similarity index 100% rename from open_gpt/serve/__init__.py rename to run_gpt/serve/__init__.py diff --git a/open_gpt/serve/executors/__init__.py b/run_gpt/serve/executors/__init__.py similarity index 100% rename from open_gpt/serve/executors/__init__.py rename to run_gpt/serve/executors/__init__.py diff --git a/open_gpt/serve/executors/base.py b/run_gpt/serve/executors/base.py similarity index 98% rename from open_gpt/serve/executors/base.py rename to run_gpt/serve/executors/base.py index 964d02c..2783d9b 100644 --- a/open_gpt/serve/executors/base.py +++ b/run_gpt/serve/executors/base.py @@ -6,8 +6,8 @@ from docarray import DocumentArray from jina import Executor, requests -from open_gpt.factory import create_model -from open_gpt.logs import logger +from run_gpt.factory import create_model +from run_gpt.logs import logger class CausualLMExecutor(Executor): diff --git a/open_gpt/serve/executors/flamingo.py b/run_gpt/serve/executors/flamingo.py similarity index 96% rename from open_gpt/serve/executors/flamingo.py rename to run_gpt/serve/executors/flamingo.py index f988f5b..aa3cd01 100644 --- a/open_gpt/serve/executors/flamingo.py +++ b/run_gpt/serve/executors/flamingo.py @@ -7,8 +7,8 @@ from docarray import DocumentArray from jina import Executor, requests -import open_gpt -from open_gpt.logs import logger +import run_gpt +from run_gpt.logs import logger class FlamingoExecutor(Executor): @@ -38,7 +38,7 @@ def __init__( '`device_map` is not supported in FlamingoExecutor. Ignored.' 
) - self.model = open_gpt.create_model( + self.model = run_gpt.create_model( model_name_or_path, precision=precision, device_map=None, **kwargs ) diff --git a/open_gpt/serve/executors/utils.py b/run_gpt/serve/executors/utils.py similarity index 100% rename from open_gpt/serve/executors/utils.py rename to run_gpt/serve/executors/utils.py diff --git a/open_gpt/serve/flow.py b/run_gpt/serve/flow.py similarity index 93% rename from open_gpt/serve/flow.py rename to run_gpt/serve/flow.py index e24133d..a17a2c7 100644 --- a/open_gpt/serve/flow.py +++ b/run_gpt/serve/flow.py @@ -7,7 +7,7 @@ @cache def get_template(template): if template.endswith('.jinja2'): - from open_gpt import __resources_path__ + from run_gpt import __resources_path__ env = Environment(loader=FileSystemLoader(__resources_path__)) return env.get_template(template) diff --git a/open_gpt/serve/gateway.py b/run_gpt/serve/gateway.py similarity index 73% rename from open_gpt/serve/gateway.py rename to run_gpt/serve/gateway.py index aa007d7..fc08077 100644 --- a/open_gpt/serve/gateway.py +++ b/run_gpt/serve/gateway.py @@ -1,13 +1,19 @@ """The serve module provides a simple way to serve a model using Jina.""" +from datetime import datetime + import jina +from fastapi.encoders import jsonable_encoder from jina import Document, DocumentArray from jina import Gateway as BaseGateway from jina.serve.runtimes.servers.composite import CompositeServer -from .pydantic_models import GenerateRequest, ChatRequest, BaseResponse, ResponseObjectEnum -from fastapi.encoders import jsonable_encoder -from datetime import datetime +from .pydantic_models import ( + BaseResponse, + ChatRequest, + GenerateRequest, + ResponseObjectEnum, +) class Gateway(BaseGateway, CompositeServer): @@ -47,15 +53,15 @@ async def generate(payload: GenerateRequest = Body(...)): parameters = _update_key(parameters) async for docs, error in self.streamer.stream( - docs=DocumentArray( - [ - Document( - tags={'prompt': payload.prompt}, - ) - ] - ), - exec_endpoint='/generate', - parameters=parameters, + docs=DocumentArray( + [ + Document( + tags={'prompt': payload.prompt}, + ) + ] + ), + exec_endpoint='/generate', + parameters=parameters, ): if error: return JSONResponse( @@ -68,9 +74,13 @@ async def generate(payload: GenerateRequest = Body(...)): return JSONResponse( status_code=status.HTTP_200_OK, - content=jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.GENERATION, - created=int(datetime.now().timestamp())) - ) + content=jsonable_encoder( + BaseResponse( + **_tags, + object=ResponseObjectEnum.GENERATION, + created=int(datetime.now().timestamp()) + ) + ), ) @app.api_route(path='/generate_stream', methods=['POST']) @@ -97,9 +107,9 @@ async def event_generator(): parameters['completion_tokens'] = completion_tokens async for docs, error in self.streamer.stream( - docs=input_docs, - exec_endpoint='/generate_stream', - parameters=parameters, + docs=input_docs, + exec_endpoint='/generate_stream', + parameters=parameters, ): if error: # TODO: find best practice to handle errors in sse @@ -113,9 +123,9 @@ async def event_generator(): stop_flag = docs[0].tags.get('choices')[0].get( 'finish_reason' ) in [ - 'stop', - 'length', - ] + 'stop', + 'length', + ] completion_tokens += 1 _tags = docs[0].tags.copy() @@ -124,9 +134,15 @@ async def event_generator(): _tags['usage'] = { k: int(v) for k, v in _tags['usage'].items() } - yield {"data": jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.GENERATION, - created=int(datetime.now().timestamp())) - )} + yield { + 
"data": jsonable_encoder( + BaseResponse( + **_tags, + object=ResponseObjectEnum.GENERATION, + created=int(datetime.now().timestamp()) + ) + ) + } input_docs = DocumentArray( [ @@ -151,15 +167,15 @@ async def chat(payload: ChatRequest = Body(...)): parameters = _update_key(parameters) async for docs, error in self.streamer.stream( - docs=DocumentArray( - [ - Document( - tags={'prompt': payload.messages}, - ) - ] - ), - exec_endpoint='/chat', - parameters=parameters, + docs=DocumentArray( + [ + Document( + tags={'prompt': payload.messages}, + ) + ] + ), + exec_endpoint='/chat', + parameters=parameters, ): if error: return JSONResponse( @@ -172,9 +188,13 @@ async def chat(payload: ChatRequest = Body(...)): return JSONResponse( status_code=status.HTTP_200_OK, - content=jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.CHAT, - created=int(datetime.now().timestamp())) - ), + content=jsonable_encoder( + BaseResponse( + **_tags, + object=ResponseObjectEnum.CHAT, + created=int(datetime.now().timestamp()) + ) + ), ) @app.api_route(path='/chat_stream', methods=['POST']) @@ -201,9 +221,9 @@ async def event_generator(): parameters['completion_tokens'] = completion_tokens async for docs, error in self.streamer.stream( - docs=input_docs, - exec_endpoint='/chat_stream', - parameters=parameters, + docs=input_docs, + exec_endpoint='/chat_stream', + parameters=parameters, ): if error: # TODO: find best practice to handle errors in sse @@ -217,9 +237,9 @@ async def event_generator(): stop_flag = docs[0].tags.get('choices')[0].get( 'finish_reason' ) in [ - 'stop', - 'length', - ] + 'stop', + 'length', + ] completion_tokens += 1 _tags = docs[0].tags.copy() @@ -228,9 +248,15 @@ async def event_generator(): _tags['usage'] = { k: int(v) for k, v in _tags['usage'].items() } - yield {"data": jsonable_encoder(BaseResponse(**_tags, object=ResponseObjectEnum.CHAT, - created=int(datetime.now().timestamp())) - )} + yield { + "data": jsonable_encoder( + BaseResponse( + **_tags, + object=ResponseObjectEnum.CHAT, + created=int(datetime.now().timestamp()) + ) + ) + } input_docs = DocumentArray( [ diff --git a/open_gpt/serve/playground/__init__.py b/run_gpt/serve/playground/__init__.py similarity index 100% rename from open_gpt/serve/playground/__init__.py rename to run_gpt/serve/playground/__init__.py diff --git a/open_gpt/serve/playground/gradio.py b/run_gpt/serve/playground/gradio.py similarity index 98% rename from open_gpt/serve/playground/gradio.py rename to run_gpt/serve/playground/gradio.py index 58b1b28..54e5899 100644 --- a/open_gpt/serve/playground/gradio.py +++ b/run_gpt/serve/playground/gradio.py @@ -23,7 +23,7 @@ title_markdown = """ # ☄️ Chat with Large-scale Multimodal Models -[[Project Page]](https://opengpt.github.io) [[Code]](https://github.com/jinaai/opengpt) +[[Project Page]](https://rungpt.github.io) [[Code]](https://github.com/jinaai/rungpt) """ _get_window_url_params = """ diff --git a/open_gpt/serve/playground/gradio_chatbot.py b/run_gpt/serve/playground/gradio_chatbot.py similarity index 100% rename from open_gpt/serve/playground/gradio_chatbot.py rename to run_gpt/serve/playground/gradio_chatbot.py diff --git a/open_gpt/serve/playground/gradio_css.py b/run_gpt/serve/playground/gradio_css.py similarity index 100% rename from open_gpt/serve/playground/gradio_css.py rename to run_gpt/serve/playground/gradio_css.py diff --git a/open_gpt/serve/pydantic_models.py b/run_gpt/serve/pydantic_models.py similarity index 54% rename from open_gpt/serve/pydantic_models.py rename to 
run_gpt/serve/pydantic_models.py index abe11f9..dc5839d 100644 --- a/open_gpt/serve/pydantic_models.py +++ b/run_gpt/serve/pydantic_models.py @@ -1,6 +1,7 @@ -from pydantic import BaseModel, Field from enum import Enum -from typing import Union, List, Tuple, Any +from typing import Any, List, Tuple, Union + +from pydantic import BaseModel, Field class BaseRequest(BaseModel): @@ -25,29 +26,41 @@ class BaseRequest(BaseModel): ) logprobs: int = Field( description='Include the log probabilities on the logprobs ' - 'most likely tokens, as well the chosen tokens', + 'most likely tokens, as well the chosen tokens', default=None, ) echo: bool = Field( description='Echo back the prompt in the completion.', default=None ) - stop: Union[str, List[str]] = Field(description='Stop sequence generation on token.', default=None) - stop_str: Union[str, List[str]] = Field(description='Stop sequence generation on token.', default=None) + stop: Union[str, List[str]] = Field( + description='Stop sequence generation on token.', default=None + ) + stop_str: Union[str, List[str]] = Field( + description='Stop sequence generation on token.', default=None + ) do_sample: bool = Field( description='Whether to sample from the generation.', default=None ) - presence_penalty: float = Field(description='Positive values penalize new tokens based on whether they appear in ' - 'the text so far, increasing the likelihood to talk about new topics.', - default=0) - frequency_penalty: float = Field(description='Positive values penalize new tokens based on their existing ' - 'frequency in the text so far, decreasing the likelihood to repeat ' - 'the same line verbatim.', - default=0) - best_of: int = Field(description='Generates best_of completions server-side and returns the "best" (the one with ' - 'the highest log probability per token). Results cannot be streamed.', - default=None) + presence_penalty: float = Field( + description='Positive values penalize new tokens based on whether they appear in ' + 'the text so far, increasing the likelihood to talk about new topics.', + default=0, + ) + frequency_penalty: float = Field( + description='Positive values penalize new tokens based on their existing ' + 'frequency in the text so far, decreasing the likelihood to repeat ' + 'the same line verbatim.', + default=0, + ) + best_of: int = Field( + description='Generates best_of completions server-side and returns the "best" (the one with ' + 'the highest log probability per token). 
Results cannot be streamed.', + default=None, + ) n: int = Field(description='The number of sequences to return.', default=None) - num_return_sequences: int = Field(description='The number of sequences to return.', default=None) + num_return_sequences: int = Field( + description='The number of sequences to return.', default=None + ) class GenerateRequest(BaseRequest): @@ -77,7 +90,7 @@ class Config: 'best_of': 5, 'logprobs': None, 'n': 3, - 'num_return_sequences': 3 + 'num_return_sequences': 3, } } @@ -91,8 +104,10 @@ class Config: schema_extra = { 'example': { - 'messages': [{"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"}], + 'messages': [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + ], 'id': '18d92585-7b66-4b7c-b818-71287c122c57', 'num_beams': 5, 'max_tokens': 50, @@ -110,7 +125,7 @@ class Config: 'best_of': 5, 'logprobs': None, 'n': 3, - 'num_return_sequences': 3 + 'num_return_sequences': 3, } } @@ -124,18 +139,26 @@ class BaseResponse(BaseModel): # session id id: str = Field(description='The session id of the generation.', default=None) - object: ResponseObjectEnum = Field(description='The task type of the response.', default=None) + object: ResponseObjectEnum = Field( + description='The task type of the response.', default=None + ) created: int = Field(description='The timestamp of the response.', default=None) - choices: List[dict] = Field(description='The generated text. It contains 5 keys: `index`, `text`, `message`, `logprobs`, ' - '`finish_reason`. For generation mode, `message` is None. For chat mode, ' - '`text` is None.') - prompt: str = Field(description='The prompt used to generate the response.', default=None) - usage: dict = Field(description='The usage of the model. It contains 3 keys: `prompt_tokens`, ' - '`completion_tokens`, `total_tokens`. `prompt_tokens` is the length of input, ' - 'in streaming mode this also includes the length of past_key_values. ' - '`completion_tokens` is the length of the generated text, in streaming mode this ' - 'also includes the length of text generated in previous steps.' - '`total_tokens` is the total length of the `prompt_tokens` and `completion_tokens`.') + choices: List[dict] = Field( + description='The generated text. It contains 5 keys: `index`, `text`, `message`, `logprobs`, ' + '`finish_reason`. For generation mode, `message` is None. For chat mode, ' + '`text` is None.' + ) + prompt: str = Field( + description='The prompt used to generate the response.', default=None + ) + usage: dict = Field( + description='The usage of the model. It contains 3 keys: `prompt_tokens`, ' + '`completion_tokens`, `total_tokens`. `prompt_tokens` is the length of input, ' + 'in streaming mode this also includes the length of past_key_values. ' + '`completion_tokens` is the length of the generated text, in streaming mode this ' + 'also includes the length of text generated in previous steps.' + '`total_tokens` is the total length of the `prompt_tokens` and `completion_tokens`.' 
+ ) class Config: allow_population_by_field_name = True @@ -146,15 +169,24 @@ class Config: 'id': '18d92585-7b66-4b7c-b818-71287c122c57', 'object': 'chat.completion', 'create': 12345678, - 'choices': [{"index": 0, - "text": None, - "message": { - "role": "assistant", - "content": "\n\nHello there, how may I assist you today?", - }, - "logprobs": None, - "finish_reason": "length"}], + 'choices': [ + { + "index": 0, + "text": None, + "message": { + "role": "assistant", + "content": "\n\nHello there, how may I assist you today?", + }, + "logprobs": None, + "finish_reason": "length", + } + ], 'prompt': 'Hello there.', - 'usage': {'prompt_tokens': 0, 'input_length': 10, 'completion_tokens': 10, 'total_tokens': 20}, + 'usage': { + 'prompt_tokens': 0, + 'input_length': 10, + 'completion_tokens': 10, + 'total_tokens': 20, + }, } } diff --git a/open_gpt/spqr/datautils.py b/run_gpt/spqr/datautils.py similarity index 90% rename from open_gpt/spqr/datautils.py rename to run_gpt/spqr/datautils.py index 0a89435..7caf7da 100644 --- a/open_gpt/spqr/datautils.py +++ b/run_gpt/spqr/datautils.py @@ -61,10 +61,16 @@ def get_c4(nsamples, seed, seqlen, model_path): from datasets import load_dataset traindata = load_dataset( - "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train" + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", ) valdata = load_dataset( - "allenai/c4", "allenai--c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation" + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", ) tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False) @@ -138,10 +144,16 @@ def get_c4_new(nsamples, seed, seqlen, model_path): from datasets import load_dataset traindata = load_dataset( - "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train" + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", ) valdata = load_dataset( - "allenai/c4", "allenai--c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation" + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", ) tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False) diff --git a/open_gpt/spqr/quant_groups.py b/run_gpt/spqr/quant_groups.py similarity index 83% rename from open_gpt/spqr/quant_groups.py rename to run_gpt/spqr/quant_groups.py index 9153923..d5c8812 100644 --- a/open_gpt/spqr/quant_groups.py +++ b/run_gpt/spqr/quant_groups.py @@ -95,17 +95,37 @@ def find_params(self, x, weight=False): if self.qq_scale_bits is not None: scale_groups = self.scale.reshape(-1, self.qq_groupsize) self.qq_scale = Quantizer(shape=scale_groups.shape) - self.qq_scale.configure(self.qq_scale_bits, perchannel=True, sym=False, round_zero=False, **self.qqq_params) + self.qq_scale.configure( + self.qq_scale_bits, + perchannel=True, + sym=False, + round_zero=False, + **self.qqq_params + ) self.qq_scale.find_params(scale_groups, weight=True) - assert self.qq_scale.scale.shape == (scale_groups.shape[0], 1), self.qq_scale.scale.shape + assert self.qq_scale.scale.shape == ( + scale_groups.shape[0], + 1, + ), self.qq_scale.scale.shape self.scale = self.qq_scale.quantize(scale_groups).reshape_as(self.scale) - if self.qq_zero_bits is not None 
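The request models above follow the same pydantic v1 pattern throughout: every field is declared with `Field(description=..., default=...)`, and `stop`/`stop_str` accept either a single string or a list. A trimmed stand-in model (hypothetical, not imported from `run_gpt.serve.pydantic_models`) that shows the pattern in isolation:

```python
from typing import List, Optional, Union

from pydantic import BaseModel, Field


class MiniRequest(BaseModel):
    """Hypothetical stand-in mirroring the Field(description=..., default=...) style above."""

    prompt: str = Field(description='The prompt to complete.')
    max_tokens: Optional[int] = Field(
        description='Maximum number of tokens to generate.', default=None
    )
    stop: Optional[Union[str, List[str]]] = Field(
        description='Stop sequence generation on token.', default=None
    )


req = MiniRequest(prompt='Hello, my name is', max_tokens=32, stop=['\n', '###'])
print(req.dict(exclude_none=True))
# {'prompt': 'Hello, my name is', 'max_tokens': 32, 'stop': ['\n', '###']}
```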
and ((not self.round_zero) or self.qq_zero_bits < self.bits): + if self.qq_zero_bits is not None and ( + (not self.round_zero) or self.qq_zero_bits < self.bits + ): zero_groups = self.zero.reshape(-1, self.qq_groupsize) self.qq_zero = Quantizer(shape=zero_groups.shape) - self.qq_zero.configure(self.qq_zero_bits, perchannel=True, sym=self.qq_zero_sym, round_zero=False, **self.qqq_params) + self.qq_zero.configure( + self.qq_zero_bits, + perchannel=True, + sym=self.qq_zero_sym, + round_zero=False, + **self.qqq_params + ) self.qq_zero.find_params(zero_groups, weight=True) - assert self.qq_zero.scale.shape == (zero_groups.shape[0], 1), self.qq_zero.scale.shape + assert self.qq_zero.scale.shape == ( + zero_groups.shape[0], + 1, + ), self.qq_zero.scale.shape self.zero = self.qq_zero.quantize(zero_groups).reshape_as(self.zero) if weight: diff --git a/open_gpt/spqr/quantize.py b/run_gpt/spqr/quantize.py similarity index 77% rename from open_gpt/spqr/quantize.py rename to run_gpt/spqr/quantize.py index 3fcefd5..5157e89 100644 --- a/open_gpt/spqr/quantize.py +++ b/run_gpt/spqr/quantize.py @@ -1,10 +1,11 @@ # Adapted from https://github.com/Vahe1994/SpQR import time -from quantizeargs import QuantizeArgs + +import huggingface_hub from datautils import * +from quantizeargs import QuantizeArgs from quantutils import * -import huggingface_hub def apply_quantize(args: QuantizeArgs, quantized_model_path: str = None): @@ -17,7 +18,11 @@ def apply_quantize(args: QuantizeArgs, quantized_model_path: str = None): else: assert args.dataset != "custom" dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model_path=args.model_path, seqlen=model.seqlen + args.dataset, + nsamples=args.nsamples, + seed=args.seed, + model_path=args.model_path, + seqlen=model.seqlen, ) device = "cuda" if torch.cuda.is_available() else "cpu" @@ -37,7 +42,9 @@ def test(model_path): datasets = ["wikitext2", "ptb", "c4"] for dataset in datasets: - dataloader, testloader = get_loaders(dataset, seed=0, model_path=model_path, seqlen=model.seqlen) + dataloader, testloader = get_loaders( + dataset, seed=0, model_path=model_path, seqlen=model.seqlen + ) print(dataset) llama_eval(model, testloader, device) @@ -46,6 +53,7 @@ def quant(model_name, quantized_model_path): model_path = huggingface_hub.snapshot_download(model_name) before_args = QuantizeArgs(model_name=model_name, model_path=model_path) apply_quantize(before_args, quantized_model_path) - quantized_args = QuantizeArgs(model_name=model_name,model_path=quantized_model_path) + quantized_args = QuantizeArgs( + model_name=model_name, model_path=quantized_model_path + ) return before_args, quantized_args - diff --git a/run_gpt/spqr/quantizeargs.py b/run_gpt/spqr/quantizeargs.py new file mode 100644 index 0000000..d628210 --- /dev/null +++ b/run_gpt/spqr/quantizeargs.py @@ -0,0 +1,60 @@ +class QuantizeArgs: + def __init__( + self, + model_name, + model_path, + dataset: str = 'wikitext2', + load_from_saved: str = None, + seed: int = 0, + nsamples: int = 128, + percdamp: float = 0.01, + wbits: int = 4, + groupsize: int = 16, + permutation_order: str = "identity", + true_sequential: bool = False, + new_eval: bool = False, + sym: bool = False, + perchannel: bool = True, + qq_scale_bits: int = 3, + round_zero: int = None, + qq_zero_bits: int = 3, + qq_zero_sym: bool = False, + qq_groupsize: int = 16, + outlier_threshold: float = 0.2, + simplified_outliers: bool = False, + save: str = '', + save_safetensors: str = '', + benchmark: int = 0, + check: bool = 
False, + skip_out_loss: bool = False, + offload_activations: bool = False, + dtype: str = "auto", + ): + self.model_name = model_name + self.model_path = model_path + self.dataset = dataset + self.load_from_saved = load_from_saved + self.seed = seed + self.nsamples = nsamples + self.percdamp = percdamp + self.wbits = wbits + self.groupsize = groupsize + self.permutation_order = permutation_order + self.true_sequential = true_sequential + self.new_eval = new_eval + self.sym = sym + self.perchannel = perchannel + self.qq_scale_bits = qq_scale_bits + self.round_zero = round_zero + self.qq_zero_bits = qq_zero_bits + self.qq_zero_sym = qq_zero_sym + self.qq_groupsize = qq_groupsize + self.outlier_threshold = outlier_threshold + self.simplified_outliers = simplified_outliers + self.save = save + self.save_safetensors = save_safetensors + self.benchmark = benchmark + self.check = check + self.skip_out_loss = skip_out_loss + self.offload_activations = offload_activations + self.dtype = dtype diff --git a/open_gpt/spqr/quantutils.py b/run_gpt/spqr/quantutils.py similarity index 82% rename from open_gpt/spqr/quantutils.py rename to run_gpt/spqr/quantutils.py index 1c714c6..4113994 100644 --- a/open_gpt/spqr/quantutils.py +++ b/run_gpt/spqr/quantutils.py @@ -1,10 +1,11 @@ # Adapted from https://github.com/Vahe1994/SpQR import time + import torch import torch.nn as nn +from spqr_engine import SPQRUtil from tqdm import trange from transformers import LlamaForCausalLM, LlamaTokenizer -from spqr_engine import SPQRUtil def save_llama(model_name, model, save_directory): @@ -15,10 +16,15 @@ def save_llama(model_name, model, save_directory): def get_llama(model_path): import torch + def skip(*args, **kwargs): pass - saved_inits = torch.nn.init.kaiming_uniform_, torch.nn.init.uniform_, torch.nn.init.normal_ # preserving + saved_inits = ( + torch.nn.init.kaiming_uniform_, + torch.nn.init.uniform_, + torch.nn.init.normal_, + ) # preserving torch.nn.init.kaiming_uniform_ = skip torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip @@ -27,11 +33,15 @@ def skip(*args, **kwargs): pretrained_model_name_or_path=model_path, local_files_only=True, low_cpu_mem_usage=True, - torch_dtype="auto" + torch_dtype="auto", ) model.seqlen = 2048 - torch.nn.init.kaiming_uniform_, torch.nn.init.uniform_, torch.nn.init.normal_ = saved_inits # restoring + ( + torch.nn.init.kaiming_uniform_, + torch.nn.init.uniform_, + torch.nn.init.normal_, + ) = saved_inits # restoring return model @@ -40,20 +50,24 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update( + find_layers( + child, layers=layers, name=name + "." 
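Taken together with `quantize.py` above, the new `QuantizeArgs` container is the single entry point for configuring an SpQR run: `quant()` downloads the checkpoint, builds a `QuantizeArgs` with the defaults shown (4-bit weights, group size 16, wikitext2 calibration), and calls `apply_quantize()`. A minimal driver sketch, assuming it is executed from inside `run_gpt/spqr` (so the flat intra-package imports resolve), a CUDA GPU is available, and the checkpoint name and output directory are placeholders:

```python
# Assumption: run from run_gpt/spqr so the flat imports used by these
# modules (quantizeargs, datautils, quantutils, ...) resolve as-is.
from quantize import quant, test

MODEL_NAME = 'huggyllama/llama-7b'        # placeholder LLaMA-family checkpoint
QUANTIZED_PATH = './llama-7b-spqr-w4g16'  # placeholder output directory

# Downloads the weights, quantizes them with the QuantizeArgs defaults
# (wbits=4, groupsize=16, dataset='wikitext2'), and returns the arg sets
# describing the original and quantized models.
before_args, quantized_args = quant(MODEL_NAME, QUANTIZED_PATH)

# Optional sanity check: perplexity of the quantized checkpoint on the
# wikitext2 / ptb / c4 evaluation splits.
test(QUANTIZED_PATH)
```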
+ name1 if name != "" else name1 + ) + ) return res def get_average_number_of_bits( - wbits: int = 3, - qq_scale_bits: int = 3, - qq_zero_bits: int = 3, - qqq_scale_bits: int = 16, - qqq_zero_bits: int = 16, - groupsize: int = 16, - qq_groupsize: int = 16, - round_zero: bool = False, - global_ol_n_share: float = 0.00, + wbits: int = 3, + qq_scale_bits: int = 3, + qq_zero_bits: int = 3, + qqq_scale_bits: int = 16, + qqq_zero_bits: int = 16, + groupsize: int = 16, + qq_groupsize: int = 16, + round_zero: bool = False, + global_ol_n_share: float = 0.00, ): # if not quantized stats are in full precision qq_scale_bits = qq_scale_bits or 16 @@ -62,11 +76,17 @@ def get_average_number_of_bits( qq_groupsize = qq_groupsize or float('inf') if round_zero: - wbits_avg = wbits + (qq_scale_bits + wbits) / groupsize + (qqq_scale_bits + qqq_zero_bits) / ( - groupsize * qq_groupsize) + wbits_avg = ( + wbits + + (qq_scale_bits + wbits) / groupsize + + (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize) + ) else: - wbits_avg = wbits + (qq_scale_bits + qq_zero_bits) / groupsize + 2 * (qqq_scale_bits + qqq_zero_bits) / ( - groupsize * qq_groupsize) + wbits_avg = ( + wbits + + (qq_scale_bits + qq_zero_bits) / groupsize + + 2 * (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize) + ) # correct accounting for outliers if global_ol_n_share > 0: @@ -88,7 +108,9 @@ def llama_sequential(model, dataloader, args, dev): layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype - inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev) + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) cache = {"i": 0, "attention_mask": None} class Catcher(nn.Module): @@ -128,7 +150,9 @@ def forward(self, inp, **kwargs): normal_outlier_count_global, w_count_global = 0, 0 for i in range(len(layers)): - print(f"\n------------------------------------------------------------------\nStarting layer {i}") + print( + f"\n------------------------------------------------------------------\nStarting layer {i}" + ) normal_outlier_count, w_count = 0, 0 stats_payload = {} @@ -185,14 +209,19 @@ def tmp(_, inp, out): simplified_outliers=args.simplified_outliers, ) - gptq[name].layer.weight.data = quantized.weight.to(gptq[name].layer.weight.data.dtype) + gptq[name].layer.weight.data = quantized.weight.to( + gptq[name].layer.weight.data.dtype + ) quantizers["model.layers.%d.%s" % (i, name)] = () # to be updated # OUTLIER STATS per module: - normal_outliers_count = quantized.unstructured_outlier_mask.to(torch.int32).sum() + normal_outliers_count = quantized.unstructured_outlier_mask.to( + torch.int32 + ).sum() - stats_payload[f"n_{name}_ol_share"] = round((normal_outliers_count / quantized.weight.numel()).item(), - 6) + stats_payload[f"n_{name}_ol_share"] = round( + (normal_outliers_count / quantized.weight.numel()).item(), 6 + ) normal_outlier_count += normal_outliers_count.item() w_count += quantized.weight.numel() @@ -246,7 +275,7 @@ def tmp(_, inp, out): args.groupsize, args.qq_groupsize, args.round_zero, - normal_outlier_count_global / w_count_global + normal_outlier_count_global / w_count_global, ) model.config.use_cache = use_cache @@ -268,7 +297,9 @@ def llama_eval(model, testenc, dev): layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype - inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev) + inps = torch.zeros( + (nsamples, model.seqlen, 
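As a quick check of the reformatted `get_average_number_of_bits` above (the `round_zero=False` branch, ignoring the outlier correction): with 3-bit weights, 3-bit first-level group statistics, 16-bit second-level statistics, and group sizes of 16, the average cost is 3 + (3+3)/16 + 2·(16+16)/(16·16) = 3.625 bits per weight. A standalone sketch of that arithmetic:

```python
def avg_bits(wbits=3, qq_scale_bits=3, qq_zero_bits=3,
             qqq_scale_bits=16, qqq_zero_bits=16,
             groupsize=16, qq_groupsize=16):
    # Mirrors the round_zero=False branch above, without the outlier share term.
    return (
        wbits
        + (qq_scale_bits + qq_zero_bits) / groupsize
        + 2 * (qqq_scale_bits + qqq_zero_bits) / (groupsize * qq_groupsize)
    )


print(avg_bits())  # 3.625
```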
model.config.hidden_size), dtype=dtype, device=dev + ) cache = {"i": 0, "attention_mask": None} class Catcher(nn.Module): @@ -286,7 +317,7 @@ def forward(self, inp, **kwargs): saved_num_threads = torch.get_num_threads() torch.set_num_threads(min(16, saved_num_threads)) for i in range(nsamples): - batch = testenc[:, (i * model.seqlen): ((i + 1) * model.seqlen)].to(dev) + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev) try: model(batch) except ValueError: @@ -324,9 +355,11 @@ def forward(self, inp, **kwargs): hidden_states = model.model.norm(hidden_states) lm_logits = model.lm_head(hidden_states) shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[:, (i * model.seqlen): ((i + 1) * model.seqlen)][:, 1:] + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) neg_log_likelihood = loss.float() * model.seqlen nlls.append(neg_log_likelihood) ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) diff --git a/open_gpt/spqr/spqr_engine.py b/run_gpt/spqr/spqr_engine.py similarity index 71% rename from open_gpt/spqr/spqr_engine.py rename to run_gpt/spqr/spqr_engine.py index 5ec99b8..a310795 100644 --- a/open_gpt/spqr/spqr_engine.py +++ b/run_gpt/spqr/spqr_engine.py @@ -1,11 +1,13 @@ # Originally from https://github.com/Vahe1994/SpQR from __future__ import annotations + import math -from typing import Optional, NamedTuple, Union +from typing import NamedTuple, Optional, Union + import torch +from quant_groups import Quantizer, quantize from tqdm.auto import tqdm from weight_permutation import get_permutation_order -from quant_groups import Quantizer, quantize class SPQRUtil: @@ -19,7 +21,9 @@ def __init__(self, layer): self.nsamples = 0 def add_batch(self, inp): - assert self.H is not None, "Already ran quantization; cannot add more data batches" + assert ( + self.H is not None + ), "Already ran quantization; cannot add more data batches" if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] @@ -82,7 +86,9 @@ def quantize( self.H = None H = H[perm][:, perm] - self.dead = torch.diag(H) == 0 # indices of input features that do not affect outputs + self.dead = ( + torch.diag(H) == 0 + ) # indices of input features that do not affect outputs if percdamp > 0: ix = torch.arange(len(H), device=weight.device) H[ix, ix] += percdamp * abs(torch.diag(H)).mean() @@ -96,7 +102,9 @@ def quantize( quantizer = Quantizer() quantizer.configure(bits, perchannel=perchannel, sym=sym, **kwargs) - assert H_inv_cho.shape[0] == H_inv_cho.shape[1] == weight.shape[1], "weight must be [out_features, in_features]" + assert ( + H_inv_cho.shape[0] == H_inv_cho.shape[1] == weight.shape[1] + ), "weight must be [out_features, in_features]" out_dim, in_dim = weight.shape # [out_features, in_features] if groupsize is None: @@ -106,15 +114,21 @@ def quantize( outlier_column_indices = torch.empty(0, dtype=torch.int64, device=weight.device) del H_inv - outlier_scale = (weight.var(dim=0) / torch.diag(H_inv_cho).square()).mean().item() + outlier_scale = ( + (weight.var(dim=0) / torch.diag(H_inv_cho).square()).mean().item() + ) unstructured_outlier_threshold = outlier_relative_threshold * outlier_scale - in_group_index = -1 # index of current group of input features, for group quantizer purposes + in_group_index = ( + -1 + ) # index of 
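The perplexity reported by `llama_eval` above is simply the exponential of the token-averaged negative log-likelihood accumulated over fixed-length windows of `model.seqlen` tokens. A standalone sketch of that final reduction (the NLL values here are made-up placeholders, not model outputs):

```python
import torch

seqlen = 2048
# Hypothetical per-window total NLLs, i.e. loss.float() * seqlen per window.
nlls = [torch.tensor(4300.0), torch.tensor(4150.0), torch.tensor(4200.0)]
nsamples = len(nlls)

# Same reduction as in llama_eval: exponentiate the mean per-token NLL.
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
print(float(ppl))  # ~7.8
```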
current group of input features, for group quantizer purposes quantization_errors = torch.zeros_like(weight) unstructured_outlier_mask = torch.zeros_like(weight, dtype=torch.bool) block_start_iter = range(0, in_dim - keep_last_columns, blocksize) - block_start_iter = tqdm(block_start_iter, leave=False) if verbose else block_start_iter + block_start_iter = ( + tqdm(block_start_iter, leave=False) if verbose else block_start_iter + ) for block_start in block_start_iter: block_end = min(block_start + blocksize, in_dim) for column_index in range(block_start, block_end): @@ -123,57 +137,90 @@ def quantize( in_group_index += 1 group_weight = weight[:, column_index : column_index + groupsize] - if simplified_outliers or (unstructured_outlier_threshold == float("inf")): + if simplified_outliers or ( + unstructured_outlier_threshold == float("inf") + ): quantizer.find_params(group_weight, weight=True) else: # objective: detect which weights will be designated as outliers, fit quantizer *without* these weights # step 1: fit quantizer on a leave-one-out version of weights, i.e. in each group, drop one weight at a time - assert perchannel, "refitting quantizer is only implemented for perchannel=True" - group_diag_hessian_inv_cho = H_inv_cho_diag[column_index : column_index + groupsize] + assert ( + perchannel + ), "refitting quantizer is only implemented for perchannel=True" + group_diag_hessian_inv_cho = H_inv_cho_diag[ + column_index : column_index + groupsize + ] loo_quantization_error_sq = get_leave_one_out_error( group_weight, group_diag_hessian_inv_cho, bits=bits, sym=sym ) # ^-- dequantized(quantized(group_weight)) using a quantizer trained on all weights except the reconstructed one - likely_unstructured_outlier_mask = (loo_quantization_error_sq > unstructured_outlier_threshold).float() + likely_unstructured_outlier_mask = ( + loo_quantization_error_sq > unstructured_outlier_threshold + ).float() non_outlier_mask = 1 - likely_unstructured_outlier_mask - mean_over_non_outliers = torch.sum(group_weight * non_outlier_mask, dim=1, keepdim=True) / torch.sum( - non_outlier_mask, dim=1, keepdim=True - ).clamp_min(1) - group_weight_without_outliers = group_weight * non_outlier_mask + mean_over_non_outliers * ( - 1 - non_outlier_mask + mean_over_non_outliers = torch.sum( + group_weight * non_outlier_mask, dim=1, keepdim=True + ) / torch.sum(non_outlier_mask, dim=1, keepdim=True).clamp_min( + 1 + ) + group_weight_without_outliers = ( + group_weight * non_outlier_mask + + mean_over_non_outliers * (1 - non_outlier_mask) + ) + quantizer.find_params( + group_weight_without_outliers, weight=True ) - quantizer.find_params(group_weight_without_outliers, weight=True) del group_diag_hessian_inv_cho, loo_quantization_error_sq - del mean_over_non_outliers, group_weight_without_outliers, non_outlier_mask + del ( + mean_over_non_outliers, + group_weight_without_outliers, + non_outlier_mask, + ) del group_weight weight_i_quantized = quantize( - weight[:, column_index].unsqueeze(1), quantizer.scale, quantizer.zero, quantizer.maxq + weight[:, column_index].unsqueeze(1), + quantizer.scale, + quantizer.zero, + quantizer.maxq, ).reshape_as(weight[:, column_index]) - delta_weight_i = weight[:, column_index] - weight_i_quantized # [out_dim] - quantization_errors[:, column_index] = delta_weight_i / H_inv_cho[column_index, column_index] # [out_dim] + delta_weight_i = ( + weight[:, column_index] - weight_i_quantized + ) # [out_dim] + quantization_errors[:, column_index] = ( + delta_weight_i / H_inv_cho[column_index, 
column_index] + ) # [out_dim] if unstructured_outlier_threshold != float("inf"): unstructured_outlier_mask[:, column_index] = ( - quantization_errors[:, column_index].square() > unstructured_outlier_threshold + quantization_errors[:, column_index].square() + > unstructured_outlier_threshold ) # re-quantize without outliers is_outlier = unstructured_outlier_mask[:, column_index].float() weight_i_quantized_wo_outliers = quantize( - (weight[:, column_index] * (1 - is_outlier)).unsqueeze(1), quantizer.scale, quantizer.zero, quantizer.maxq + (weight[:, column_index] * (1 - is_outlier)).unsqueeze(1), + quantizer.scale, + quantizer.zero, + quantizer.maxq, ).reshape_as(weight[:, column_index]) weight_i_quantized = ( - weight_i_quantized_wo_outliers * (1 - is_outlier) + weight[:, column_index] * is_outlier + weight_i_quantized_wo_outliers * (1 - is_outlier) + + weight[:, column_index] * is_outlier ) # [out_dim] del weight_i_quantized_wo_outliers - delta_weight_i = weight[:, column_index] - weight_i_quantized # [out_dim] - quantization_errors[:, column_index] = delta_weight_i / H_inv_cho[column_index, column_index] # [out_dim] + delta_weight_i = ( + weight[:, column_index] - weight_i_quantized + ) # [out_dim] + quantization_errors[:, column_index] = ( + delta_weight_i / H_inv_cho[column_index, column_index] + ) # [out_dim] weight[:, column_index] = weight_i_quantized weight[:, column_index + 1 : block_end].addr_( @@ -205,7 +252,9 @@ class QuantizationResult(NamedTuple): """A collection of codebooks, indices and assorted statistics produced by SPQRUtil; not memory-optimized!""" weight: torch.FloatTensor # dequantized(quantized(weight)), same shape as the original - perm: Optional[torch.LongTensor] # optional input permutation indices that were used during quantization + perm: Optional[ + torch.LongTensor + ] # optional input permutation indices that were used during quantization # NOTE: if permutation_order != identity, all subsequent tensors (incl. outlier indices) are permuted in that order! quantization_errors: torch.Tensor # per-element quantization errors, defined as (weight - quantized_weight) / diag(inverse_hessian_cholesky) @@ -213,27 +262,46 @@ class QuantizationResult(NamedTuple): unstructured_outlier_mask: torch.Tensor # bool mask where True means that this is an individual outlier -def get_leave_one_out_error(group_weight: torch.Tensor, group_diag_hessian_inv_cho: torch.Tensor, *, bits, sym): +def get_leave_one_out_error( + group_weight: torch.Tensor, group_diag_hessian_inv_cho: torch.Tensor, *, bits, sym +): """EXPERIMENTAL! 
BEWARE - for each weight, fit quantizer without this_one_weight and return this one weight's reconstruction""" assert group_weight.ndim == 2 loo_indices = torch.arange(group_weight.shape[1], device=group_weight.device) - loo_indices = loo_indices[1:] - (loo_indices[:, None] >= loo_indices[1:]).to(loo_indices.dtype) - groupwise_loo_data = group_weight[:, loo_indices] # [num_groups, num_loo = groupsize, groupsize - 1] + loo_indices = loo_indices[1:] - (loo_indices[:, None] >= loo_indices[1:]).to( + loo_indices.dtype + ) + groupwise_loo_data = group_weight[ + :, loo_indices + ] # [num_groups, num_loo = groupsize, groupsize - 1] fast_quantizer = Quantizer(shape=groupwise_loo_data.flatten(0, 1).shape) fast_quantizer.configure(bits, perchannel=True, sym=sym) fast_quantizer.find_params(groupwise_loo_data.flatten(0, 1), weight=True) # compute error improvement from not quantizing each one weight # to do so, we shall first train quantizer on leave-one-out data (which can be done faster since not all data affects quantization) - loo_groupwise_reconstructed_weights = fast_quantizer.quantize(groupwise_loo_data.flatten(0, 1)).reshape_as(groupwise_loo_data) - loo_group_diag_hessian_inv_cho = group_diag_hessian_inv_cho[loo_indices] # [num_loo = groupsize, groupsize - 1] + loo_groupwise_reconstructed_weights = fast_quantizer.quantize( + groupwise_loo_data.flatten(0, 1) + ).reshape_as(groupwise_loo_data) + loo_group_diag_hessian_inv_cho = group_diag_hessian_inv_cho[ + loo_indices + ] # [num_loo = groupsize, groupsize - 1] assert group_diag_hessian_inv_cho.ndim == 1 # total quantization error consists of hessian-weighted mse on all remaining weights except for the one that's left out # -- this is because the left-out weights will not be quantized, and therefore, has zero quantization error - loo_errors_sq = ((loo_groupwise_reconstructed_weights - groupwise_loo_data) / loo_group_diag_hessian_inv_cho).square().sum(-1) - assert loo_errors_sq.shape == group_weight.shape # [num_groups, num_loo = groupsize] + loo_errors_sq = ( + ( + (loo_groupwise_reconstructed_weights - groupwise_loo_data) + / loo_group_diag_hessian_inv_cho + ) + .square() + .sum(-1) + ) + assert ( + loo_errors_sq.shape == group_weight.shape + ) # [num_groups, num_loo = groupsize] # as a baseline error, quantize data normally without outliers base_quantizer = Quantizer(shape=group_weight.shape) @@ -241,7 +309,9 @@ def get_leave_one_out_error(group_weight: torch.Tensor, group_diag_hessian_inv_c base_quantizer.find_params(group_weight, weight=True) baseline_reconstructed_weights = base_quantizer.quantize(group_weight) baseline_errors_sq = ( - ((baseline_reconstructed_weights - group_weight) / group_diag_hessian_inv_cho).square().sum(dim=1, keepdim=True) + ((baseline_reconstructed_weights - group_weight) / group_diag_hessian_inv_cho) + .square() + .sum(dim=1, keepdim=True) ) # outlier's usefulness = how much does mse decrease from treating this weight as an outlier diff --git a/open_gpt/spqr/test.py b/run_gpt/spqr/test.py similarity index 100% rename from open_gpt/spqr/test.py rename to run_gpt/spqr/test.py diff --git a/open_gpt/spqr/weight_permutation.py b/run_gpt/spqr/weight_permutation.py similarity index 93% rename from open_gpt/spqr/weight_permutation.py rename to run_gpt/spqr/weight_permutation.py index 583c6b2..7fc0961 100644 --- a/open_gpt/spqr/weight_permutation.py +++ b/run_gpt/spqr/weight_permutation.py @@ -25,7 +25,12 @@ def find_greedy_nearest_indices(weight: torch.Tensor, use_abs: bool = False): return permutation -def 
get_permutation_order(H: torch.Tensor, W: torch.Tensor, permutation_order: str = "identity", use_abs: bool = False): +def get_permutation_order( + H: torch.Tensor, + W: torch.Tensor, + permutation_order: str = "identity", + use_abs: bool = False, +): """ Permutation order for layer weights. :param H: Hessian of Weights diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 8df172f..0beaafe 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -1,5 +1,5 @@ -import open_gpt -from open_gpt.profile import LLMMeasure, end_measure, log_measures, start_measure +import run_gpt +from run_gpt.profile import LLMMeasure, end_measure, log_measures, start_measure PROMPT = 'The goal of life is' @@ -21,11 +21,11 @@ def main(args): print(f"===> start model loading ...") model_load_start = start_measure() if args.precision == 'fp16': - model = open_gpt.create_model( + model = run_gpt.create_model( args.model_name, precision='fp16', device_map=args.device_map ) else: - model = open_gpt.create_model( + model = run_gpt.create_model( args.model_name, precision=args.precision, adapter_name_or_path=args.adapter_name, @@ -54,7 +54,7 @@ def main(args): if __name__ == '__main__': import argparse - parser = argparse.ArgumentParser(description='Benchmark for open_gpt.') + parser = argparse.ArgumentParser(description='Benchmark for run_gpt.') parser.add_argument( '--model-name', type=str, diff --git a/setup.py b/setup.py index 27b9551..6c5721e 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,11 @@ from setuptools import find_packages, setup if sys.version_info < (3, 8, 0): - raise OSError(f'OpenGPT requires Python >=3.8, but yours is {sys.version}') + raise OSError(f'RunGPT requires Python >=3.8, but yours is {sys.version}') try: - pkg_name = 'open-gpt-torch' - libinfo_py = path.join(path.dirname(__file__), 'open_gpt', '__init__.py') + pkg_name = 'run-gpt-torch' + libinfo_py = path.join(path.dirname(__file__), 'run_gpt', '__init__.py') libinfo_content = open(libinfo_py, 'r', encoding='utf8').readlines() version_line = [l.strip() for l in libinfo_content if l.startswith('__version__')][ 0 @@ -33,8 +33,8 @@ author='Jina AI', author_email='hello@jina.ai', license='Apache 2.0', - url='https://https://github.com/jina-ai/opengpt', - download_url='https://https://github.com/jina-ai/opengpt/tags', + url='https://https://github.com/jina-ai/rungpt', + download_url='https://https://github.com/jina-ai/rungpt/tags', long_description=_long_description, long_description_content_type='text/markdown', zip_safe=False, @@ -62,7 +62,7 @@ }, entry_points={ 'console_scripts': [ - 'opengpt = open_gpt.cli.application:main', + 'rungpt = run_gpt.cli.application:main', ], }, classifiers=[ @@ -84,9 +84,9 @@ "Topic :: Software Development :: Libraries :: Python Modules", ], project_urls={ - 'Documentation': 'https://opengpt.jina.ai', - 'Source': 'https://github.com/jina-ai/opengpt/', - 'Tracker': 'https://github.com/jina-ai/opengpt/issues', + 'Documentation': 'https://rungpt.jina.ai', + 'Source': 'https://github.com/jina-ai/rungpt/', + 'Tracker': 'https://github.com/jina-ai/rungpt/issues', }, keywords=[ "jina",