diff --git a/docs/en/annotator_entries/AutoGGUFVisionModel.md b/docs/en/annotator_entries/AutoGGUFVisionModel.md new file mode 100644 index 00000000000000..0d6a6c086eabc3 --- /dev/null +++ b/docs/en/annotator_entries/AutoGGUFVisionModel.md @@ -0,0 +1,202 @@ +{%- capture title -%} +AutoGGUFVisionModel +{%- endcapture -%} + +{%- capture description -%} +Multimodal annotator that uses the llama.cpp library to generate text completions with large +language models. It supports ingesting images for captioning. + +At the moment only CLIP based models are supported. + +For settable parameters, and their explanations, see HasLlamaCppInferenceProperties, +HasLlamaCppModelProperties and refer to the llama.cpp documentation of +[server.cpp](https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server) +for more information. + +If the parameters are not set, the annotator will default to use the parameters provided by +the model. + +This annotator expects a column of annotator type AnnotationImage for the image and +Annotation for the caption. Note that the image bytes in the image annotation need to be +raw image bytes without preprocessing. We provide the helper function +ImageAssembler.loadImagesAsBytes to load the image bytes from a directory. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val autoGGUFVisionModel = AutoGGUFVisionModel.pretrained() + .setInputCols("image", "document") + .setOutputCol("completions") +``` + +The default model is `"llava_v1.5_7b_Q4_0_gguf"`, if no name is provided. + +For available pretrained models please see the [Models Hub](https://sparknlp.org/models). 
+ +For extended examples of usage, see the +[AutoGGUFVisionModelTest](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTest.scala) +and the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb). + +**Note**: To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set +the number of GPU layers with the `setNGpuLayers` method. + +When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` +according to your hardware to avoid out-of-memory errors. +{%- endcapture -%} + +{%- capture input_anno -%} +IMAGE, DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture python_example -%} +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline +from pyspark.sql.functions import lit + +documentAssembler = DocumentAssembler() \ + .setInputCol("caption") \ + .setOutputCol("caption_document") +imageAssembler = ImageAssembler() \ + .setInputCol("image") \ + .setOutputCol("image_assembler") + +imagesPath = "src/test/resources/image/" +data = ImageAssembler \ + .loadImagesAsBytes(spark, imagesPath) \ + .withColumn("caption", lit("Caption this image.")) # Add a caption to each image. 
+ +nPredict = 40 +model = AutoGGUFVisionModel.pretrained() \ + .setInputCols(["caption_document", "image_assembler"]) \ + .setOutputCol("completions") \ + .setBatchSize(4) \ + .setNGpuLayers(99) \ + .setNCtx(4096) \ + .setMinKeep(0) \ + .setMinP(0.05) \ + .setNPredict(nPredict) \ + .setNProbs(0) \ + .setPenalizeNl(False) \ + .setRepeatLastN(256) \ + .setRepeatPenalty(1.18) \ + .setStopStrings(["", "Llama:", "User:"]) \ + .setTemperature(0.05) \ + .setTfsZ(1) \ + .setTypicalP(1) \ + .setTopK(40) \ + .setTopP(0.95) + +pipeline = Pipeline().setStages([documentAssembler, imageAssembler, model]) +pipeline.fit(data).transform(data) \ + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "completions.result") \ + .show(truncate = False) ++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|image_name |result | ++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|palace.JPEG |[ The image depicts a large, ornate room with high ceilings and beautifully decorated walls. There are several chairs placed throughout the space, some of which have cushions] | +|egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the scene and appears to be sleeping while holding] | +|hippopotamus.JPEG|[ A large brown hippo is swimming in a body of water, possibly an aquarium. The hippo appears to be enjoying its time in the water and seems relaxed as it floats] | +|hen.JPEG |[ The image features a large chicken standing next to several baby chickens. In total, there are five birds in the scene: one adult and four young ones. 
They appear to be gathered together] | +|ostrich.JPEG |[ The image features a large, long-necked bird standing in the grass. It appears to be an ostrich or similar species with its head held high and looking around. In addition to] | +|junco.JPEG |[ A small bird with a black head and white chest is standing on the snow. It appears to be looking at something, possibly food or another animal in its vicinity. The scene takes place out] | +|bluetick.jpg |[ A dog with a red collar is sitting on the floor, looking at something. The dog appears to be staring into the distance or focusing its attention on an object in front of it.] | +|chihuahua.jpg |[ A small brown dog wearing a sweater is sitting on the floor. The dog appears to be looking at something, possibly its owner or another animal in the room. It seems comfortable and relaxed]| +|tractor.JPEG |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels and tires. The tractor appears to be parked on top of an empty field with] | +|ox.JPEG |[ A large bull with horns is standing in a grassy field.] 
| ++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import com.johnsnowlabs.nlp.ImageAssembler +import com.johnsnowlabs.nlp.annotator._ +import com.johnsnowlabs.nlp.base._ +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.lit + +val documentAssembler = new DocumentAssembler() + .setInputCol("caption") + .setOutputCol("caption_document") + +val imageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + +val imagesPath = "src/test/resources/image/" +val data: DataFrame = ImageAssembler + .loadImagesAsBytes(ResourceHelper.spark, imagesPath) + .withColumn("caption", lit("Caption this image.")) // Add a caption to each image. + +val nPredict = 40 +val model = AutoGGUFVisionModel.pretrained() + .setInputCols("caption_document", "image_assembler") + .setOutputCol("completions") + .setBatchSize(4) + .setNGpuLayers(99) + .setNCtx(4096) + .setMinKeep(0) + .setMinP(0.05f) + .setNPredict(nPredict) + .setNProbs(0) + .setPenalizeNl(false) + .setRepeatLastN(256) + .setRepeatPenalty(1.18f) + .setStopStrings(Array("", "Llama:", "User:")) + .setTemperature(0.05f) + .setTfsZ(1) + .setTypicalP(1) + .setTopK(40) + .setTopP(0.95f) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, imageAssembler, model)) +pipeline + .fit(data) + .transform(data) + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "completions.result") + .show(truncate = false) ++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|image_name |result | 
++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|palace.JPEG |[ The image depicts a large, ornate room with high ceilings and beautifully decorated walls. There are several chairs placed throughout the space, some of which have cushions] | +|egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the scene and appears to be sleeping while holding] | +|hippopotamus.JPEG|[ A large brown hippo is swimming in a body of water, possibly an aquarium. The hippo appears to be enjoying its time in the water and seems relaxed as it floats] | +|hen.JPEG |[ The image features a large chicken standing next to several baby chickens. In total, there are five birds in the scene: one adult and four young ones. They appear to be gathered together] | +|ostrich.JPEG |[ The image features a large, long-necked bird standing in the grass. It appears to be an ostrich or similar species with its head held high and looking around. In addition to] | +|junco.JPEG |[ A small bird with a black head and white chest is standing on the snow. It appears to be looking at something, possibly food or another animal in its vicinity. The scene takes place out] | +|bluetick.jpg |[ A dog with a red collar is sitting on the floor, looking at something. The dog appears to be staring into the distance or focusing its attention on an object in front of it.] | +|chihuahua.jpg |[ A small brown dog wearing a sweater is sitting on the floor. The dog appears to be looking at something, possibly its owner or another animal in the room. It seems comfortable and relaxed]| +|tractor.JPEG |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels and tires. 
The tractor appears to be parked on top of an empty field with] | +|ox.JPEG |[ A large bull with horns is standing in a grassy field.] | ++-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture api_link -%} +[AutoGGUFVisionModel](/api/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel) +{%- endcapture -%} + +{%- capture python_api_link -%} +[AutoGGUFVisionModel](/api/python/reference/autosummary/sparknlp/annotator/seq2seq/auto_gguf_vision_model/index.html) +{%- endcapture -%} + +{%- capture source_link -%} +[AutoGGUFVisionModel](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotators.md b/docs/en/annotators.md index c5c21707b80f8e..541d151533c3ce 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -47,6 +47,7 @@ There are two types of Annotators: |---|---|---| {% include templates/anno_table_entry.md path="" name="AutoGGUFEmbeddings" summary="Annotator that uses the llama.cpp library to generate text embeddings with large language models."%} {% include templates/anno_table_entry.md path="" name="AutoGGUFModel" summary="Annotator that uses the llama.cpp library to generate text completions with large language models."%} +{% include templates/anno_table_entry.md path="" name="AutoGGUFVisionModel" summary="Multimodal annotator that uses the llama.cpp library to generate text completions with large language models."%} {% include 
templates/anno_table_entry.md path="" name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} {% include templates/anno_table_entry.md path="" name="Chunk2Doc" summary="Converts a `CHUNK` type column back into `DOCUMENT`. Useful when trying to re-tokenize or do further analysis on a `CHUNK` result."%} diff --git a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb index d4152e51194c25..8d00e9d3b1a291 100644 --- a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb +++ b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb @@ -264,8 +264,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb index 3a76bdf5f01ece..09be6b85ee1083 100644 --- a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -320,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -335,7 +335,6 @@ "source": [ "from sparknlp.annotator import *\n", "\n", - "# All these params should be identical to the original ONNX model\n", "autoGGUFModel = (\n", " AutoGGUFModel.loadSavedModel(EXPORT_PATH, spark)\n", " .setInputCols(\"document\")\n", @@ -355,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ @@ -389,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -415,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -619,8 +618,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb new file mode 100644 index 00000000000000..a33d9c351ba094 --- /dev/null +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb @@ -0,0 +1,805 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb)\n", + "\n", + "# Import llama.cpp 🦙 vision models into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- Multimodal inference with llama.cpp was introduced in `Spark NLP 5.6.0`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You need to use your own `.gguf` model files, which also include the models from the [Hugging Face Models](https://huggingface.co/models?library=gguf).\n", + "- At the moment only CLIP based models are supported." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a GGUF Vision Model\n", + "\n", + "Let's download a GGUF vision model to test it out. 
For this, we will use [Mozilla/llava-v1.5-7b](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/tree/main). It is a 7B parameter model which is also available in 4-bit quantization.\n", + "\n", + "We can download the model and its multimodal projection (mmproj) file by selecting the q4 GGUF file from the \"Files and versions\" tab.\n", + "\n", + "Once downloaded, we can directly import this model into Spark NLP!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXPORT_PATH_MODEL = \"llava-v1.5-7b-Q4_K.gguf\"\n", + "EXPORT_PATH_MMPROJ = \"llava-v1.5-7b-mmproj-Q4_0.gguf\"\n", + "! wget \"https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/{EXPORT_PATH_MODEL}?download=true\" -O {EXPORT_PATH_MODEL}\n", + "! wget \"https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/{EXPORT_PATH_MMPROJ}?download=true\" -O {EXPORT_PATH_MMPROJ}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save AutoGGUFVision models in Spark NLP\n", + "\n", + "- Let's install and set up Spark NLP (if running in Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. 
If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use the `loadSavedModel` function in `AutoGGUFVisionModel`\n", + "- Most parameters will be set automatically. They can also be set later after loading the model in `AutoGGUFVisionModel` during runtime, so don't worry about setting them now.\n", + "- `loadSavedModel` accepts three parameters: \n", + " 1. the path to the exported gguf model\n", + " 2. the path to the exported mmproj gguf model\n", + " 3. the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "autoGGUFModel = (\n", + " AutoGGUFVisionModel.loadSavedModel(EXPORT_PATH_MODEL, EXPORT_PATH_MMPROJ, spark)\n", + " .setInputCols([\"caption_document\", \"image_assembler\"])\n", + " .setOutputCol(\"completions\")\n", + " .setChatTemplate(\"vicuna\")\n", + " .setBatchSize(4)\n", + " .setNGpuLayers(99)\n", + " .setNCtx(4096)\n", + " .setMinKeep(0)\n", + " .setMinP(0.05)\n", + " .setNPredict(40)\n", + " .setNProbs(0)\n", + " .setPenalizeNl(False)\n", + " .setRepeatLastN(256)\n", + " .setRepeatPenalty(1.18)\n", + " .setStopStrings([\"\", \"Llama:\", \"User:\"])\n", + " .setTemperature(0.05)\n", + " .setTfsZ(1)\n", + " .setTypicalP(1)\n", + " .setTopK(40)\n", + " .setTopP(0.95)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "autoGGUFModel.write().overwrite().save(f\"llava_v1.5_7b_Q4_0_gguf_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your GGUF model, loaded and saved by Spark NLP 🚀. You can now use it on other machines, clusters, or any place you wish to use your new and shiny GGUF model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "llava-v1.5-7b-mmproj-Q4_0.gguf\tllava-v1.5-7b-Q4_K.gguf metadata\n" + ] + } + ], + "source": [ + "! ls llava_v1.5_7b_Q4_0_gguf_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: Captioning Images\n", + "\n", + "Now let's see how we can use the model to caption some images. 
Let's first download some images we can caption." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "!wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/images/images.zip\n", + "import shutil\n", + "shutil.unpack_archive(\"images.zip\", \"images\", \"zip\")\n", + "\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "import os\n", + "\n", + "_, axes = plt.subplots(2, 5, figsize=(10,5))\n", + "axes = axes.flatten()\n", + "\n", + "i = 0\n", + "images_path = \"images/images/\"\n", + "for file_name in os.listdir(images_path):\n", + " if file_name.lower().endswith((\".png\", \".jpg\", \".jpeg\", \".gif\")):\n", + " file_path = os.path.join(\"images/images/\", file_name)\n", + " ax = axes[i]\n", + " ax.imshow(Image.open(file_path).convert(\"RGB\"))\n", + " ax.title.set_text(file_name)\n", + " ax.axis(\"off\")\n", + " i += 1\n", + "\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can load the images to Spark.\n", + "\n", + "**NOTE**: The llama.cpp backend of the annotator expects a different image byte format than the default format used by Spark. This annotator expects *raw* image bytes, instead of the OpenCV image compatible format, which is used by default.\n", + "\n", + "For this, we can use the helper function `loadImagesAsBytes` from the `ImageAssembler`. It will load the images in the right format in a Spark DataFrame. 
Additionally, we will add a column for the caption:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.base import *\n", + "from pyspark.sql.functions import lit\n", + "\n", + "data = ImageAssembler.loadImagesAsBytes(spark, images_path)\n", + "# Add a caption to each image.\n", + "data = data.withColumn(\"caption\", lit(\"Caption this image.\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now We need an `ImageAssembler` and `DocumentAssembler` to turn the images and captions into the right format for Spark NLP. We also load the model we just saved above. Then we can assemble a pipeline and run it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/18 13:46:33 WARN DAGScheduler: Broadcasting large task binary with size 1090.9 KiB\n", + "clip_model_load: model name: openai/clip-vit-large-patch14-336 (0 + 1) / 1]\n", + "clip_model_load: description: image encoder for LLaVA\n", + "clip_model_load: GGUF version: 3\n", + "clip_model_load: alignment: 32\n", + "clip_model_load: n_tensors: 377\n", + "clip_model_load: n_kv: 19\n", + "clip_model_load: ftype: q4_0\n", + "\n", + "clip_model_load: loaded meta data with 19 key-value pairs and 377 tensors from /tmp/spark-5acddb2b-4bca-474e-befd-d8613d27a78e/userFiles-4926735e-f265-46bc-8a9f-9edb6a65484e/llava-v1.5-7b-mmproj-Q4_0.gguf\n", + "clip_model_load: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", + "clip_model_load: - kv 0: general.architecture str = clip\n", + "clip_model_load: - kv 1: clip.has_text_encoder bool = false\n", + "clip_model_load: - kv 2: clip.has_vision_encoder bool = true\n", + "clip_model_load: - kv 3: clip.has_llava_projector bool = true\n", + "clip_model_load: - kv 4: general.file_type u32 = 2\n", + "clip_model_load: - kv 5: general.name str = openai/clip-vit-large-patch14-336\n", + "clip_model_load: - kv 6: general.description str = image encoder for LLaVA\n", + "clip_model_load: - kv 7: clip.vision.image_size u32 = 336\n", + "clip_model_load: - kv 8: clip.vision.patch_size u32 = 14\n", + "clip_model_load: - kv 9: clip.vision.embedding_length u32 = 1024\n", + "clip_model_load: - kv 10: clip.vision.feed_forward_length u32 = 4096\n", + "clip_model_load: - kv 11: clip.vision.projection_dim u32 = 768\n", + "clip_model_load: - kv 12: clip.vision.attention.head_count u32 = 16\n", + "clip_model_load: - kv 13: clip.vision.attention.layer_norm_epsilon f32 = 0.000010\n", + "clip_model_load: - kv 14: clip.vision.block_count u32 = 23\n", + "clip_model_load: - kv 15: clip.vision.image_mean arr[f32,3] = [0.481455, 0.457828, 0.408211]\n", + "clip_model_load: - kv 16: clip.vision.image_std arr[f32,3] = [0.268630, 0.261303, 0.275777]\n", + "clip_model_load: - kv 17: clip.use_gelu bool = false\n", + "clip_model_load: - kv 18: general.quantization_version u32 = 2\n", + "clip_model_load: - type f32: 235 tensors\n", + "clip_model_load: - type f16: 1 tensors\n", + "clip_model_load: - type q4_0: 141 tensors\n", + "ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no\n", + "ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no\n", + "ggml_cuda_init: found 1 CUDA devices:\n", + " Device 0: NVIDIA GeForce RTX 3070, compute capability 8.6, VMM: yes\n", + "clip_model_load: CLIP using CUDA backend\n", + "clip_model_load: text_encoder: 0\n", + "clip_model_load: vision_encoder: 1\n", + "clip_model_load: llava_projector: 1\n", + 
"clip_model_load: model size: 169.18 MB\n", + "clip_model_load: metadata size: 0.13 MB\n", + "clip_model_load: params backend buffer size = 169.18 MB (377 tensors)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] build info build=3534 commit=\"641f5dd2\"\n", + "[INFO] system info n_threads=6 n_threads_batch=-1 total_threads=6 system_info=\"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \"\n", + "[INFO] Multi Modal Mode Enabled\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "key clip.vision.image_grid_pinpoints not found in file\n", + "key clip.vision.mm_patch_merge_type not found in file\n", + "key clip.vision.image_crop_resolution not found in file\n", + "ggml_gallocr_reserve_n: reallocating CUDA0 buffer from size 0.00 MiB to 32.89 MiB\n", + "clip_model_load: compute allocated memory: 32.89 MB\n", + "llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tmp/spark-5acddb2b-4bca-474e-befd-d8613d27a78e/userFiles-4926735e-f265-46bc-8a9f-9edb6a65484e/llava-v1.5-7b-Q4_K.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", + "llama_model_loader: - kv 0: general.architecture str = llama\n", + "llama_model_loader: - kv 1: general.name str = LLaMA v2\n", + "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n", + "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", + "llama_model_loader: - kv 4: llama.block_count u32 = 32\n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 10: general.file_type u32 = 15\n", + "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n", + "llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", + "llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", + "llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1\n", + "llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2\n", + "llama_model_loader: - kv 17: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - kv 18: general.quantization_version u32 = 2\n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 193 tensors\n", + "llama_model_loader: - type q6_K: 33 tensors\n", + "llm_load_vocab: special tokens cache size = 3\n", + "llm_load_vocab: token to piece cache size = 0.1684 MB\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = llama\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32000\n", + 
"llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: vocab_only = 0\n", + "llm_load_print_meta: n_ctx_train = 4096\n", + "llm_load_print_meta: n_embd = 4096\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_rot = 128\n", + "llm_load_print_meta: n_swa = 0\n", + "llm_load_print_meta: n_embd_head_k = 128\n", + "llm_load_print_meta: n_embd_head_v = 128\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: n_embd_k_gqa = 4096\n", + "llm_load_print_meta: n_embd_v_gqa = 4096\n", + "llm_load_print_meta: f_norm_eps = 0.0e+00\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: f_logit_scale = 0.0e+00\n", + "llm_load_print_meta: n_ff = 11008\n", + "llm_load_print_meta: n_expert = 0\n", + "llm_load_print_meta: n_expert_used = 0\n", + "llm_load_print_meta: causal attn = 1\n", + "llm_load_print_meta: pooling type = 0\n", + "llm_load_print_meta: rope type = 0\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_ctx_orig_yarn = 4096\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: ssm_d_conv = 0\n", + "llm_load_print_meta: ssm_d_inner = 0\n", + "llm_load_print_meta: ssm_d_state = 0\n", + "llm_load_print_meta: ssm_dt_rank = 0\n", + "llm_load_print_meta: model type = 7B\n", + "llm_load_print_meta: model ftype = Q4_K - Medium\n", + "llm_load_print_meta: model params = 6.74 B\n", + "llm_load_print_meta: model size = 3.80 GiB (4.84 BPW) \n", + "llm_load_print_meta: general.name = LLaMA v2\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 2 ''\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: PAD token = 0 ''\n", + 
"llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_print_meta: max token length = 48\n", + "llm_load_tensors: ggml ctx size = 0.27 MiB\n", + "llm_load_tensors: offloading 32 repeating layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloaded 33/33 layers to GPU\n", + "llm_load_tensors: CPU buffer size = 70.31 MiB\n", + "llm_load_tensors: CUDA0 buffer size = 3820.94 MiB\n", + "..................................................................................................\n", + "llama_new_context_with_model: n_ctx = 4096\n", + "llama_new_context_with_model: n_batch = 512\n", + "llama_new_context_with_model: n_ubatch = 512\n", + "llama_new_context_with_model: flash_attn = 0\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "llama_kv_cache_init: CUDA0 KV buffer size = 2048.00 MiB\n", + "llama_new_context_with_model: KV self size = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB\n", + "llama_new_context_with_model: CUDA_Host output buffer size = 0.12 MiB\n", + "ggml_gallocr_reserve_n: reallocating CUDA0 buffer from size 0.00 MiB to 296.00 MiB\n", + "ggml_gallocr_reserve_n: reallocating CUDA_Host buffer from size 0.00 MiB to 16.01 MiB\n", + "llama_new_context_with_model: CUDA0 compute buffer size = 296.00 MiB\n", + "llama_new_context_with_model: CUDA_Host compute buffer size = 16.01 MiB\n", + "llama_new_context_with_model: graph nodes = 1030\n", + "llama_new_context_with_model: graph splits = 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] initializing slots n_slots=1\n", + "[INFO] new slot slot_id=0 n_ctx_slot=4096\n", + "[INFO] model loaded\n", + "[INFO] chat template chat_example=\"You are a helpful assistant\\n\\nUSER: Hello\\nASSISTANT: Hi there\\nUSER: How are you?\\nASSISTANT:\" built_in=false\n", + "[INFO] all slots are idle and system prompt is empty, clear the KV cache\n", + "[INFO] 
slot is processing task slot_id=0 task_id=0\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=0 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 76.17 ms by CLIP ( 0.13 ms per image patch)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "llama_output_reserve: reallocating output buffer from size 0.12 MiB to 1.22 MiB\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 481.17 ms / 1 tokens ( 481.17 ms per token, 2.08 tokens per second) slot_id=0 task_id=0 t_prompt_processing=481.165 n_prompt_tokens_processed=1 t_token=481.165 n_tokens_second=2.078289152369769\n", + "[INFO] generation eval time = 757.27 ms / 40 runs ( 18.93 ms per token, 52.82 tokens per second) slot_id=0 task_id=0 t_token_generation=757.271 n_decoded=40 t_token=18.931775 n_tokens_second=52.821248932020374\n", + "[INFO] total time = 1238.44 ms slot_id=0 task_id=0 t_prompt_processing=481.165 t_token_generation=757.271 t_total=1238.436\n", + "[INFO] slot released slot_id=0 task_id=0 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=1\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=1 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded 
in 48.94 ms by CLIP ( 0.08 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 418.86 ms / 1 tokens ( 418.86 ms per token, 2.39 tokens per second) slot_id=0 task_id=1 t_prompt_processing=418.858 n_prompt_tokens_processed=1 t_token=418.858 n_tokens_second=2.387443954753162\n", + "[INFO] generation eval time = 760.78 ms / 40 runs ( 19.02 ms per token, 52.58 tokens per second) slot_id=0 task_id=1 t_token_generation=760.785 n_decoded=40 t_token=19.019624999999998 n_tokens_second=52.57727215967718\n", + "[INFO] total time = 1179.64 ms slot_id=0 task_id=1 t_prompt_processing=418.858 t_token_generation=760.785 t_total=1179.643\n", + "[INFO] slot released slot_id=0 task_id=1 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/18 13:46:37 WARN DAGScheduler: Broadcasting large task binary with size 1090.9 KiB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] slot is processing task slot_id=0 task_id=84\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=84 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 51.93 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 434.93 ms / 1 tokens ( 434.93 ms per token, 2.30 tokens per second) slot_id=0 task_id=84 t_prompt_processing=434.926 n_prompt_tokens_processed=1 t_token=434.926 n_tokens_second=2.2992417100840146\n", + "[INFO] generation eval time = 759.00 ms / 40 runs ( 18.98 ms per token, 52.70 tokens per second) slot_id=0 task_id=84 t_token_generation=759.003 n_decoded=40 t_token=18.975075 n_tokens_second=52.70071396292241\n", + "[INFO] total time = 1193.93 ms slot_id=0 task_id=84 t_prompt_processing=434.926 t_token_generation=759.003 t_total=1193.929\n", + "[INFO] slot released slot_id=0 task_id=84 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=85\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=85 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: 
image encoded in 49.35 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "encode_image_with_clip: image embedding created: 576 tokens (1 + 3) / 4]\n", + "\n", + "encode_image_with_clip: image encoded in 50.33 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 418.23 ms / 1 tokens ( 418.23 ms per token, 2.39 tokens per second) slot_id=0 task_id=85 t_prompt_processing=418.234 n_prompt_tokens_processed=1 t_token=418.234 n_tokens_second=2.391005991861016\n", + "[INFO] generation eval time = 310.67 ms / 17 runs ( 18.27 ms per token, 54.72 tokens per second) slot_id=0 task_id=85 t_token_generation=310.665 n_decoded=17 t_token=18.274411764705885 n_tokens_second=54.72132361225113\n", + "[INFO] total time = 728.90 ms slot_id=0 task_id=85 t_prompt_processing=418.234 t_token_generation=310.665 
t_total=728.899\n", + "[INFO] slot released slot_id=0 task_id=85 n_ctx=4096 n_past=609 n_system_tokens=0 n_cache_tokens=18 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=87\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=87 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 423.11 ms / 1 tokens ( 423.11 ms per token, 2.36 tokens per second) slot_id=0 task_id=87 t_prompt_processing=423.106 n_prompt_tokens_processed=1 t_token=423.106 n_tokens_second=2.3634739285190944\n", + "[INFO] generation eval time = 771.11 ms / 40 runs ( 19.28 ms per token, 51.87 tokens per second) slot_id=0 task_id=87 t_token_generation=771.106 n_decoded=40 t_token=19.27765 n_tokens_second=51.873542677660396\n", + "[INFO] total time = 1194.21 ms slot_id=0 task_id=87 t_prompt_processing=423.106 t_token_generation=771.106 t_total=1194.212\n", + "[INFO] slot released slot_id=0 task_id=87 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=88\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=88 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 50.07 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, 
call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 423.79 ms / 1 tokens ( 423.79 ms per token, 2.36 tokens per second) slot_id=0 task_id=88 t_prompt_processing=423.79 n_prompt_tokens_processed=1 t_token=423.79 n_tokens_second=2.359659265202105\n", + "[INFO] generation eval time = 251.86 ms / 14 runs ( 17.99 ms per token, 55.59 tokens per second) slot_id=0 task_id=88 t_token_generation=251.863 n_decoded=14 t_token=17.990214285714284 n_tokens_second=55.58577480614461\n", + "[INFO] total time = 675.65 ms slot_id=0 task_id=88 t_prompt_processing=423.79 t_token_generation=251.863 t_total=675.653\n", + "[INFO] slot released slot_id=0 task_id=88 n_ctx=4096 n_past=606 n_system_tokens=0 n_cache_tokens=15 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=89\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=89 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 49.78 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "encode_image_with_clip: image embedding created: 
576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 50.26 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 422.05 ms / 1 tokens ( 422.05 ms per token, 2.37 tokens per second) slot_id=0 task_id=89 t_prompt_processing=422.047 n_prompt_tokens_processed=1 t_token=422.047 n_tokens_second=2.369404355439086\n", + "[INFO] generation eval time = 351.31 ms / 19 runs ( 18.49 ms per token, 54.08 tokens per second) slot_id=0 task_id=89 t_token_generation=351.31 n_decoded=19 t_token=18.49 n_tokens_second=54.08328826392644\n", + "[INFO] total time = 773.36 ms slot_id=0 task_id=89 t_prompt_processing=422.047 t_token_generation=351.31 t_total=773.357\n", + "[INFO] slot released slot_id=0 task_id=89 n_ctx=4096 n_past=611 n_system_tokens=0 n_cache_tokens=20 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=90\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=90 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 419.07 ms / 1 tokens ( 419.07 ms per token, 2.39 tokens per second) slot_id=0 
task_id=90 t_prompt_processing=419.071 n_prompt_tokens_processed=1 t_token=419.071 n_tokens_second=2.386230495548487\n", + "[INFO] generation eval time = 768.85 ms / 40 runs ( 19.22 ms per token, 52.03 tokens per second) slot_id=0 task_id=90 t_token_generation=768.849 n_decoded=40 t_token=19.221225 n_tokens_second=52.0258204146718\n", + "[INFO] total time = 1187.92 ms slot_id=0 task_id=90 t_prompt_processing=419.071 t_token_generation=768.849 t_total=1187.92\n", + "[INFO] slot released slot_id=0 task_id=90 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=91\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=91 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 49.82 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 424.45 ms / 1 tokens ( 424.45 ms per token, 2.36 tokens per second) slot_id=0 task_id=91 t_prompt_processing=424.45 n_prompt_tokens_processed=1 t_token=424.45 n_tokens_second=2.3559901048415597\n", 
+ "[INFO] generation eval time = 761.95 ms / 40 runs ( 19.05 ms per token, 52.50 tokens per second) slot_id=0 task_id=91 t_token_generation=761.953 n_decoded=40 t_token=19.048825 n_tokens_second=52.49667630418149\n", + "[INFO] total time = 1186.40 ms slot_id=0 task_id=91 t_prompt_processing=424.45 t_token_generation=761.953 t_total=1186.403\n", + "[INFO] slot released slot_id=0 task_id=91 n_ctx=4096 n_past=632 n_system_tokens=0 n_cache_tokens=41 truncated=false\n", + "[INFO] slot is processing task slot_id=0 task_id=92\n", + "[INFO] kv cache rm [p0, end) slot_id=0 task_id=92 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "encode_image_with_clip: image embedding created: 576 tokens\n", + "\n", + "encode_image_with_clip: image encoded in 49.04 ms by CLIP ( 0.09 ms per image patch)\n", + "ggml_gallocr_needs_realloc: node inp_embd is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] prompt eval time = 417.26 ms / 1 tokens ( 417.26 ms per token, 2.40 tokens per second) slot_id=0 task_id=92 t_prompt_processing=417.263 n_prompt_tokens_processed=1 t_token=417.263 n_tokens_second=2.3965700289745318\n", + "[INFO] generation eval time = 329.49 ms / 18 runs ( 18.31 ms per token, 54.63 tokens per second) slot_id=0 task_id=92 t_token_generation=329.493 n_decoded=18 t_token=18.305166666666665 n_tokens_second=54.629385146270174\n", + "[INFO] total time = 746.76 ms slot_id=0 task_id=92 t_prompt_processing=417.263 t_token_generation=329.493 t_total=746.756\n", + "[INFO] slot released slot_id=0 task_id=92 
n_ctx=4096 n_past=610 n_system_tokens=0 n_cache_tokens=19 truncated=false\n", + "+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|image_name |result |\n", + "+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|palace.JPEG |[ The image depicts a large, ornate room with high ceilings and yellow walls. It features an elegant sitting area with several chairs arranged around the space. There are also multiple c] |\n", + "|egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the frame and appears to be sleeping while holding] |\n", + "|hippopotamus.JPEG|[ A large brown hippo is swimming in a pond, with its head above the water. The hippo appears to be enjoying itself as it floats on top of the water.] |\n", + "|hen.JPEG |[ The image features a large white chicken standing next to several baby chicks. There are at least five visible chickens in the scene, with one adult and four young ones surrounding it. They]|\n", + "|ostrich.JPEG |[ A large ostrich stands in a grassy field, surrounded by trees and bushes. The bird is the main focus of the image with its long neck stretched out as it looks around at] |\n", + "|junco.JPEG |[ A small bird with a black head and white chest is standing on the snow.] |\n", + "|bluetick.jpg |[ A dog with a red collar is sitting on the floor.] |\n", + "|chihuahua.jpg |[ A small brown dog wearing a sweater and collar is sitting on the floor.] |\n", + "|tractor.JPEG |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels. 
The tractor appears to be parked on top of an agricultural field with rows of] |\n", + "|ox.JPEG |[ A large bull with long horns is standing in a grassy field.] |\n", + "+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ggml_gallocr_needs_realloc: src 0 (KQ_mask) of node KQ_mask (view) is not valid\n", + "ggml_gallocr_alloc_graph: cannot reallocate multi buffer graph automatically, call reserve\n", + "ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 0)\n", + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "documentAssembler = (\n", + " DocumentAssembler().setInputCol(\"caption\").setOutputCol(\"caption_document\")\n", + ")\n", + "imageAssembler = ImageAssembler().setInputCol(\"image\").setOutputCol(\"image_assembler\")\n", + "model = AutoGGUFVisionModel.load(\"llava_v1.5_7b_Q4_0_gguf_spark_nlp\")\n", + "pipeline = Pipeline().setStages([documentAssembler, imageAssembler, model])\n", + "\n", + "pipeline.fit(data).transform(data).selectExpr(\n", + " \"reverse(split(image.origin, '/'))[0] as image_name\", \"completions.result\"\n", + ").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of GGUF models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "sparknlp_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala index fae6267df57f21..ff532c8d56eeb6 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,7 +128,7 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided - val llamaCppVersion = "0.1.4" + val llamaCppVersion = "0.1.6" val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py index e9c3984c21ecc1..e946fbc0e472e8 100644 --- a/python/sparknlp/annotator/seq2seq/__init__.py +++ b/python/sparknlp/annotator/seq2seq/__init__.py @@ -22,6 +22,7 @@ from sparknlp.annotator.seq2seq.phi2_transformer import * from sparknlp.annotator.seq2seq.mistral_transformer import * from sparknlp.annotator.seq2seq.auto_gguf_model import * +from sparknlp.annotator.seq2seq.auto_gguf_vision_model import * from sparknlp.annotator.seq2seq.phi3_transformer import * from sparknlp.annotator.seq2seq.nllb_transformer import * from sparknlp.annotator.seq2seq.cpm_transformer import * diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py 
b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py index d28ac006c9da22..37c96319564782 100755 --- a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -17,7 +17,7 @@ from sparknlp.common import * -class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): +class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties): """ Annotator that uses the llama.cpp library to generate text completions with large language models. @@ -241,507 +241,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): inputAnnotatorTypes = [AnnotatorType.DOCUMENT] outputAnnotatorType = AnnotatorType.DOCUMENT - # -------- MODEl PARAMETERS -------- - nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", - typeConverter=TypeConverters.toInt) - nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", - typeConverter=TypeConverters.toInt) - nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", - "Set the number of threads to use during batch and prompt processing", - typeConverter=TypeConverters.toInt) - nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing", - typeConverter=TypeConverters.toInt) - nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) - nBatch = Param(Params._dummy(), "nBatch", - "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", - typeConverter=TypeConverters.toInt) - nUbatch = Param(Params._dummy(), "nUbatch", - "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", - typeConverter=TypeConverters.toInt) - nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", - typeConverter=TypeConverters.toInt) - nChunks = Param(Params._dummy(), "nChunks", "Set 
the maximal number of chunks to process", - typeConverter=TypeConverters.toInt) - nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", - typeConverter=TypeConverters.toInt) - pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", - typeConverter=TypeConverters.toFloat) - nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", - typeConverter=TypeConverters.toInt) - nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft", - "Set the number of layers to store in VRAM for the draft model (-1 - use default)", - typeConverter=TypeConverters.toInt) - # Set how to split the model across GPUs - # - # - NONE: No GPU split - # - LAYER: Split the model across GPUs by layer - # - ROW: Split the model across GPUs by rows - gpuSplitMode = Param(Params._dummy(), "gpuSplitMode", "Set how to split the model across GPUs", - typeConverter=TypeConverters.toString) - mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", - typeConverter=TypeConverters.toInt) - tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", - typeConverter=TypeConverters.toListFloat) - grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) - grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) - ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", - typeConverter=TypeConverters.toFloat) - ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", - "Set the RoPE frequency scaling factor, expands context by a factor of 1/N", - typeConverter=TypeConverters.toFloat) - yarnExtFactor = Param(Params._dummy(), "yarnExtFactor", "Set the YaRN extrapolation mix factor", - 
typeConverter=TypeConverters.toFloat) - yarnAttnFactor = Param(Params._dummy(), "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude", - typeConverter=TypeConverters.toFloat) - yarnBetaFast = Param(Params._dummy(), "yarnBetaFast", "Set the YaRN low correction dim or beta", - typeConverter=TypeConverters.toFloat) - yarnBetaSlow = Param(Params._dummy(), "yarnBetaSlow", "Set the YaRN high correction dim or alpha", - typeConverter=TypeConverters.toFloat) - yarnOrigCtx = Param(Params._dummy(), "yarnOrigCtx", "Set the YaRN original context size of model", - typeConverter=TypeConverters.toInt) - defragmentationThreshold = Param(Params._dummy(), "defragmentationThreshold", - "Set the KV cache defragmentation threshold", typeConverter=TypeConverters.toFloat) - # Set optimization strategies that help on some NUMA systems (if available) - # - # Available Strategies: - # - # - DISABLED: No NUMA optimizations - # - DISTRIBUTE: Spread execution evenly over all - # - ISOLATE: Only spawn threads on CPUs on the node that execution started on - # - NUMA_CTL: Use the CPU map provided by numactl - # - MIRROR: Mirrors the model across NUMA nodes - numaStrategy = Param(Params._dummy(), "numaStrategy", - "Set optimization strategies that help on some NUMA systems (if available)", - typeConverter=TypeConverters.toString) - # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
- # - # - UNSPECIFIED: Don't use any scaling - # - LINEAR: Linear scaling - # - YARN: YaRN RoPE scaling - ropeScalingType = Param(Params._dummy(), "ropeScalingType", - "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", - typeConverter=TypeConverters.toString) - # Set the pooling type for embeddings, use model default if unspecified - # - # - 0 UNSPECIFIED: Don't use any pooling - # - 1 MEAN: Mean Pooling - # - 2 CLS: CLS Pooling - poolingType = Param(Params._dummy(), "poolingType", - "Set the pooling type for embeddings, use model default if unspecified", - typeConverter=TypeConverters.toString) - modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", - typeConverter=TypeConverters.toString) - modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) - lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)", - typeConverter=TypeConverters.toString) - lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", - typeConverter=TypeConverters.toString) - # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") - embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", - typeConverter=TypeConverters.toBoolean) - flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", - typeConverter=TypeConverters.toBoolean) - inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", - typeConverter=TypeConverters.toBoolean) - useMmap = Param(Params._dummy(), "useMmap", - "Whether to use memory-map model (faster load but may increase pageouts if not using 
mlock)", - typeConverter=TypeConverters.toBoolean) - useMlock = Param(Params._dummy(), "useMlock", - "Whether to force the system to keep model in RAM rather than swapping or compressing", - typeConverter=TypeConverters.toBoolean) - noKvOffload = Param(Params._dummy(), "noKvOffload", "Whether to disable KV offload", - typeConverter=TypeConverters.toBoolean) - systemPrompt = Param(Params._dummy(), "systemPrompt", "Set a system prompt to use", - typeConverter=TypeConverters.toString) - chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use", - typeConverter=TypeConverters.toString) - - # -------- INFERENCE PARAMETERS -------- - inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with", - typeConverter=TypeConverters.toString) - inputSuffix = Param(Params._dummy(), "inputSuffix", "Set a suffix for infilling", - typeConverter=TypeConverters.toString) - cachePrompt = Param(Params._dummy(), "cachePrompt", "Whether to remember the prompt to avoid reprocessing it", - typeConverter=TypeConverters.toBoolean) - nPredict = Param(Params._dummy(), "nPredict", "Set the number of tokens to predict", - typeConverter=TypeConverters.toInt) - topK = Param(Params._dummy(), "topK", "Set top-k sampling", typeConverter=TypeConverters.toInt) - topP = Param(Params._dummy(), "topP", "Set top-p sampling", typeConverter=TypeConverters.toFloat) - minP = Param(Params._dummy(), "minP", "Set min-p sampling", typeConverter=TypeConverters.toFloat) - tfsZ = Param(Params._dummy(), "tfsZ", "Set tail free sampling, parameter z", typeConverter=TypeConverters.toFloat) - typicalP = Param(Params._dummy(), "typicalP", "Set locally typical sampling, parameter p", - typeConverter=TypeConverters.toFloat) - temperature = Param(Params._dummy(), "temperature", "Set the temperature", typeConverter=TypeConverters.toFloat) - dynamicTemperatureRange = Param(Params._dummy(), "dynatempRange", "Set the dynamic temperature range", - 
typeConverter=TypeConverters.toFloat) - dynamicTemperatureExponent = Param(Params._dummy(), "dynatempExponent", "Set the dynamic temperature exponent", - typeConverter=TypeConverters.toFloat) - repeatLastN = Param(Params._dummy(), "repeatLastN", "Set the last n tokens to consider for penalties", - typeConverter=TypeConverters.toInt) - repeatPenalty = Param(Params._dummy(), "repeatPenalty", "Set the penalty of repeated sequences of tokens", - typeConverter=TypeConverters.toFloat) - frequencyPenalty = Param(Params._dummy(), "frequencyPenalty", "Set the repetition alpha frequency penalty", - typeConverter=TypeConverters.toFloat) - presencePenalty = Param(Params._dummy(), "presencePenalty", "Set the repetition alpha presence penalty", - typeConverter=TypeConverters.toFloat) - miroStat = Param(Params._dummy(), "miroStat", "Set MiroStat sampling strategies.", - typeConverter=TypeConverters.toString) - miroStatTau = Param(Params._dummy(), "mirostatTau", "Set the MiroStat target entropy, parameter tau", - typeConverter=TypeConverters.toFloat) - miroStatEta = Param(Params._dummy(), "mirostatEta", "Set the MiroStat learning rate, parameter eta", - typeConverter=TypeConverters.toFloat) - penalizeNl = Param(Params._dummy(), "penalizeNl", "Whether to penalize newline tokens", - typeConverter=TypeConverters.toBoolean) - nKeep = Param(Params._dummy(), "nKeep", "Set the number of tokens to keep from the initial prompt", - typeConverter=TypeConverters.toInt) - seed = Param(Params._dummy(), "seed", "Set the RNG seed", typeConverter=TypeConverters.toInt) - nProbs = Param(Params._dummy(), "nProbs", "Set the amount top tokens probabilities to output if greater than 0.", - typeConverter=TypeConverters.toInt) - minKeep = Param(Params._dummy(), "minKeep", - "Set the amount of tokens the samplers should return at least (0 = disabled)", - typeConverter=TypeConverters.toInt) - grammar = Param(Params._dummy(), "grammar", "Set BNF-like grammar to constrain generations", - 
typeConverter=TypeConverters.toString) - penaltyPrompt = Param(Params._dummy(), "penaltyPrompt", - "Override which part of the prompt is penalized for repetition.", - typeConverter=TypeConverters.toString) - ignoreEos = Param(Params._dummy(), "ignoreEos", - "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)", - typeConverter=TypeConverters.toBoolean) - disableTokenIds = Param(Params._dummy(), "disableTokenIds", "Set the token ids to disable in the completion", - typeConverter=TypeConverters.toListInt) - stopStrings = Param(Params._dummy(), "stopStrings", "Set strings upon seeing which token generation is stopped", - typeConverter=TypeConverters.toListString) - samplers = Param(Params._dummy(), "samplers", "Set which samplers to use for token generation in the given order", - typeConverter=TypeConverters.toListString) - useChatTemplate = Param(Params._dummy(), "useChatTemplate", - "Set whether or not generate should apply a chat template", - typeConverter=TypeConverters.toBoolean) - - # -------- MODEL SETTERS -------- - def setNThreads(self, nThreads: int): - """Set the number of threads to use during generation""" - return self._set(nThreads=nThreads) - - def setNThreadsDraft(self, nThreadsDraft: int): - """Set the number of threads to use during draft generation""" - return self._set(nThreadsDraft=nThreadsDraft) - - def setNThreadsBatch(self, nThreadsBatch: int): - """Set the number of threads to use during batch and prompt processing""" - return self._set(nThreadsBatch=nThreadsBatch) - - def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): - """Set the number of threads to use during batch and prompt processing""" - return self._set(nThreadsBatchDraft=nThreadsBatchDraft) - - def setNCtx(self, nCtx: int): - """Set the size of the prompt context""" - return self._set(nCtx=nCtx) - - def setNBatch(self, nBatch: int): - """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" - return 
self._set(nBatch=nBatch) - - def setNUbatch(self, nUbatch: int): - """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" - return self._set(nUbatch=nUbatch) - - def setNDraft(self, nDraft: int): - """Set the number of tokens to draft for speculative decoding""" - return self._set(nDraft=nDraft) - - def setNChunks(self, nChunks: int): - """Set the maximal number of chunks to process""" - return self._set(nChunks=nChunks) - - def setNSequences(self, nSequences: int): - """Set the number of sequences to decode""" - return self._set(nSequences=nSequences) - - def setPSplit(self, pSplit: float): - """Set the speculative decoding split probability""" - return self._set(pSplit=pSplit) - - def setNGpuLayers(self, nGpuLayers: int): - """Set the number of layers to store in VRAM (-1 - use default)""" - return self._set(nGpuLayers=nGpuLayers) - - def setNGpuLayersDraft(self, nGpuLayersDraft: int): - """Set the number of layers to store in VRAM for the draft model (-1 - use default)""" - return self._set(nGpuLayersDraft=nGpuLayersDraft) - - def setGpuSplitMode(self, gpuSplitMode: str): - """Set how to split the model across GPUs""" - return self._set(gpuSplitMode=gpuSplitMode) - - def setMainGpu(self, mainGpu: int): - """Set the main GPU that is used for scratch and small tensors.""" - return self._set(mainGpu=mainGpu) - - def setTensorSplit(self, tensorSplit: List[float]): - """Set how split tensors should be distributed across GPUs""" - return self._set(tensorSplit=tensorSplit) - - def setGrpAttnN(self, grpAttnN: int): - """Set the group-attention factor""" - return self._set(grpAttnN=grpAttnN) - - def setGrpAttnW(self, grpAttnW: int): - """Set the group-attention width""" - return self._set(grpAttnW=grpAttnW) - - def setRopeFreqBase(self, ropeFreqBase: float): - """Set the RoPE base frequency, used by NTK-aware scaling""" - return self._set(ropeFreqBase=ropeFreqBase) - - def setRopeFreqScale(self, ropeFreqScale: float): - """Set the RoPE 
frequency scaling factor, expands context by a factor of 1/N""" - return self._set(ropeFreqScale=ropeFreqScale) - - def setYarnExtFactor(self, yarnExtFactor: float): - """Set the YaRN extrapolation mix factor""" - return self._set(yarnExtFactor=yarnExtFactor) - - def setYarnAttnFactor(self, yarnAttnFactor: float): - """Set the YaRN scale sqrt(t) or attention magnitude""" - return self._set(yarnAttnFactor=yarnAttnFactor) - - def setYarnBetaFast(self, yarnBetaFast: float): - """Set the YaRN low correction dim or beta""" - return self._set(yarnBetaFast=yarnBetaFast) - - def setYarnBetaSlow(self, yarnBetaSlow: float): - """Set the YaRN high correction dim or alpha""" - return self._set(yarnBetaSlow=yarnBetaSlow) - - def setYarnOrigCtx(self, yarnOrigCtx: int): - """Set the YaRN original context size of model""" - return self._set(yarnOrigCtx=yarnOrigCtx) - - def setDefragmentationThreshold(self, defragmentationThreshold: float): - """Set the KV cache defragmentation threshold""" - return self._set(defragmentationThreshold=defragmentationThreshold) - - def setNumaStrategy(self, numaStrategy: str): - """Set optimization strategies that help on some NUMA systems (if available)""" - numaUpper = numaStrategy.upper() - numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] - if numaUpper not in numaStrategies: - raise ValueError( - f"Invalid NUMA strategy: {numaUpper}. 
" - + f"Valid values are: {numaStrategies}" - ) - return self._set(numaStrategy=numaStrategy) - - def setRopeScalingType(self, ropeScalingType: str): - """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" - return self._set(ropeScalingType=ropeScalingType) - - def setPoolingType(self, poolingType: bool): - """Set the pooling type for embeddings, use model default if unspecified""" - poolingTypeUpper = poolingType.upper() - poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] - if poolingTypeUpper not in poolingTypes: - raise ValueError( - f"Invalid pooling type: {poolingType}. " - + f"Valid values are: {poolingTypes}" - ) - return self._set(poolingType=poolingType) - - def setModelDraft(self, modelDraft: str): - """Set the draft model for speculative decoding""" - return self._set(modelDraft=modelDraft) - - def setModelAlias(self, modelAlias: str): - """Set a model alias""" - return self._set(modelAlias=modelAlias) - - def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): - """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" - return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) - - def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): - """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" - return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) - - def setEmbedding(self, embedding: bool): - """Whether to load model with embedding support""" - return self._set(embedding=embedding) - - def setFlashAttention(self, flashAttention: bool): - """Whether to enable Flash Attention""" - return self._set(flashAttention=flashAttention) - - def setInputPrefixBos(self, inputPrefixBos: bool): - """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` bool""" - return self._set(inputPrefixBos=inputPrefixBos) - - def setUseMmap(self, useMmap: bool): - """Whether to use memory-map 
model (faster load but may increase pageouts if not using mlock)""" - return self._set(useMmap=useMmap) - - def setUseMlock(self, useMlock: bool): - """Whether to force the system to keep model in RAM rather than swapping or compressing""" - return self._set(useMlock=useMlock) - - def setNoKvOffload(self, noKvOffload: bool): - """Whether to disable KV offload""" - return self._set(noKvOffload=noKvOffload) - - def setSystemPrompt(self, systemPrompt: bool): - """Set a system prompt to use""" - return self._set(systemPrompt=systemPrompt) - - def setChatTemplate(self, chatTemplate: str): - """The chat template to use""" - return self._set(chatTemplate=chatTemplate) - - # -------- INFERENCE SETTERS -------- - def setInputPrefix(self, inputPrefix: str): - """Set the prompt to start generation with""" - return self._set(inputPrefix=inputPrefix) - - def setInputSuffix(self, inputSuffix: str): - """Set a suffix for infilling""" - return self._set(inputSuffix=inputSuffix) - - def setCachePrompt(self, cachePrompt: bool): - """Whether to remember the prompt to avoid reprocessing it""" - return self._set(cachePrompt=cachePrompt) - - def setNPredict(self, nPredict: int): - """Set the number of tokens to predict""" - return self._set(nPredict=nPredict) - - def setTopK(self, topK: int): - """Set top-k sampling""" - return self._set(topK=topK) - - def setTopP(self, topP: float): - """Set top-p sampling""" - return self._set(topP=topP) - - def setMinP(self, minP: float): - """Set min-p sampling""" - return self._set(minP=minP) - - def setTfsZ(self, tfsZ: float): - """Set tail free sampling, parameter z""" - return self._set(tfsZ=tfsZ) - - def setTypicalP(self, typicalP: float): - """Set locally typical sampling, parameter p""" - return self._set(typicalP=typicalP) - - def setTemperature(self, temperature: float): - """Set the temperature""" - return self._set(temperature=temperature) - - def setDynamicTemperatureRange(self, dynamicTemperatureRange: float): - """Set the dynamic 
temperature range""" - return self._set(dynamicTemperatureRange=dynamicTemperatureRange) - - def setDynamicTemperatureExponent(self, dynamicTemperatureExponent: float): - """Set the dynamic temperature exponent""" - return self._set(dynamicTemperatureExponent=dynamicTemperatureExponent) - - def setRepeatLastN(self, repeatLastN: int): - """Set the last n tokens to consider for penalties""" - return self._set(repeatLastN=repeatLastN) - - def setRepeatPenalty(self, repeatPenalty: float): - """Set the penalty of repeated sequences of tokens""" - return self._set(repeatPenalty=repeatPenalty) - - def setFrequencyPenalty(self, frequencyPenalty: float): - """Set the repetition alpha frequency penalty""" - return self._set(frequencyPenalty=frequencyPenalty) - - def setPresencePenalty(self, presencePenalty: float): - """Set the repetition alpha presence penalty""" - return self._set(presencePenalty=presencePenalty) - - def setMiroStat(self, miroStat: str): - """Set MiroStat sampling strategies.""" - return self._set(miroStat=miroStat) - - def setMiroStatTau(self, miroStatTau: float): - """Set the MiroStat target entropy, parameter tau""" - return self._set(miroStatTau=miroStatTau) - - def setMiroStatEta(self, miroStatEta: float): - """Set the MiroStat learning rate, parameter eta""" - return self._set(miroStatEta=miroStatEta) - - def setPenalizeNl(self, penalizeNl: bool): - """Whether to penalize newline tokens""" - return self._set(penalizeNl=penalizeNl) - - def setNKeep(self, nKeep: int): - """Set the number of tokens to keep from the initial prompt""" - return self._set(nKeep=nKeep) - - def setSeed(self, seed: int): - """Set the RNG seed""" - return self._set(seed=seed) - - def setNProbs(self, nProbs: int): - """Set the amount top tokens probabilities to output if greater than 0.""" - return self._set(nProbs=nProbs) - - def setMinKeep(self, minKeep: int): - """Set the amount of tokens the samplers should return at least (0 = disabled)""" - return 
self._set(minKeep=minKeep) - - def setGrammar(self, grammar: bool): - """Set BNF-like grammar to constrain generations""" - return self._set(grammar=grammar) - - def setPenaltyPrompt(self, penaltyPrompt: str): - """Override which part of the prompt is penalized for repetition.""" - return self._set(penaltyPrompt=penaltyPrompt) - - def setIgnoreEos(self, ignoreEos: bool): - """Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)""" - return self._set(ignoreEos=ignoreEos) - - def setDisableTokenIds(self, disableTokenIds: List[int]): - """Set the token ids to disable in the completion""" - return self._set(disableTokenIds=disableTokenIds) - - def setStopStrings(self, stopStrings: List[str]): - """Set strings upon seeing which token generation is stopped""" - return self._set(stopStrings=stopStrings) - - def setSamplers(self, samplers: List[str]): - """Set which samplers to use for token generation in the given order""" - return self._set(samplers=samplers) - - def setUseChatTemplate(self, useChatTemplate: bool): - """Set whether generate should apply a chat template""" - return self._set(useChatTemplate=useChatTemplate) - - # -------- JAVA SETTERS -------- - def setTokenIdBias(self, tokenIdBias: Dict[int, float]): - """Set token id bias""" - return self._call_java("setTokenIdBias", tokenIdBias) - - def setTokenBias(self, tokenBias: Dict[str, float]): - """Set token id bias""" - return self._call_java("setTokenBias", tokenBias) - - def setLoraAdapters(self, loraAdapters: Dict[str, float]): - """Set token id bias""" - return self._call_java("setLoraAdapters", loraAdapters) - - def getMetadata(self): - """Gets the metadata of the model""" - return self._call_java("getMetadata") @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel", java_model=None): @@ -749,7 +248,13 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFMo classname=classname, 
java_model=java_model ) - # self._setDefault() + self._setDefault( + useChatTemplate=True, + nCtx=4096, + nBatch=512, + embedding=False, + nPredict=100 + ) @staticmethod def loadSavedModel(folder, spark_session): diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py new file mode 100755 index 00000000000000..b05150ed3b9905 --- /dev/null +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py @@ -0,0 +1,333 @@ +# Copyright 2017-2025 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the AutoGGUFVisionModel.""" +from sparknlp.common import * + + +class AutoGGUFVisionModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties): + """Multimodal annotator that uses the llama.cpp library to generate text completions with large + language models. It supports ingesting images for captioning. + + At the moment only CLIP based models are supported. + + For settable parameters, and their explanations, see HasLlamaCppInferenceProperties, + HasLlamaCppModelProperties and refer to the llama.cpp documentation of + `server.cpp <https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server>`__ + for more information. + + If the parameters are not set, the annotator will default to use the parameters provided by + the model. + + This annotator expects a column of annotator type AnnotationImage for the image and + Annotation for the caption.
Note that the image bytes in the image annotation need to be + raw image bytes without preprocessing. We provide the helper function + ImageAssembler.loadImagesAsBytes to load the image bytes from a directory. + + Pretrained models can be loaded with ``pretrained`` of the companion object: + + .. code-block:: python + + autoGGUFVisionModel = AutoGGUFVisionModel.pretrained() \\ + .setInputCols(["image", "document"]) \\ + .setOutputCol("completions") + + + The default model is ``"llava_v1.5_7b_Q4_0_gguf"``, if no name is provided. + + For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__. + + For extended examples of usage, see the + `AutoGGUFVisionModelTest <https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTest.scala>`__ + and the + `example notebook <https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb>`__. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``IMAGE, DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + nThreads + Set the number of threads to use during generation + nThreadsDraft + Set the number of threads to use during draft generation + nThreadsBatch + Set the number of threads to use during batch and prompt processing + nThreadsBatchDraft + Set the number of threads to use during batch and prompt processing + nCtx + Set the size of the prompt context + nBatch + Set the logical batch size for prompt processing (must be >=32 to use BLAS) + nUbatch + Set the physical batch size for prompt processing (must be >=32 to use BLAS) + nDraft + Set the number of tokens to draft for speculative decoding + nChunks + Set the maximal number of chunks to process + nSequences + Set the number of sequences to decode + pSplit + Set the speculative decoding split probability + nGpuLayers + Set the number of layers to store in VRAM (-1 - use default) + nGpuLayersDraft + Set the number of layers to store in VRAM for the draft model (-1 - use default) + gpuSplitMode + Set how to split the model across GPUs + mainGpu + Set the
main GPU that is used for scratch and small tensors. + tensorSplit + Set how split tensors should be distributed across GPUs + grpAttnN + Set the group-attention factor + grpAttnW + Set the group-attention width + ropeFreqBase + Set the RoPE base frequency, used by NTK-aware scaling + ropeFreqScale + Set the RoPE frequency scaling factor, expands context by a factor of 1/N + yarnExtFactor + Set the YaRN extrapolation mix factor + yarnAttnFactor + Set the YaRN scale sqrt(t) or attention magnitude + yarnBetaFast + Set the YaRN low correction dim or beta + yarnBetaSlow + Set the YaRN high correction dim or alpha + yarnOrigCtx + Set the YaRN original context size of model + defragmentationThreshold + Set the KV cache defragmentation threshold + numaStrategy + Set optimization strategies that help on some NUMA systems (if available) + ropeScalingType + Set the RoPE frequency scaling method, defaults to linear unless specified by the model + poolingType + Set the pooling type for embeddings, use model default if unspecified + modelDraft + Set the draft model for speculative decoding + modelAlias + Set a model alias + lookupCacheStaticFilePath + Set path to static lookup cache to use for lookup decoding (not updated by generation) + lookupCacheDynamicFilePath + Set path to dynamic lookup cache to use for lookup decoding (updated by generation) + embedding + Whether to load model with embedding support + flashAttention + Whether to enable Flash Attention + inputPrefixBos + Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + useMmap + Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + useMlock + Whether to force the system to keep model in RAM rather than swapping or compressing + noKvOffload + Whether to disable KV offload + systemPrompt + Set a system prompt to use + chatTemplate + The chat template to use + inputPrefix + Set the prompt to start generation with + inputSuffix + Set a suffix for infilling 
+ cachePrompt + Whether to remember the prompt to avoid reprocessing it + nPredict + Set the number of tokens to predict + topK + Set top-k sampling + topP + Set top-p sampling + minP + Set min-p sampling + tfsZ + Set tail free sampling, parameter z + typicalP + Set locally typical sampling, parameter p + temperature + Set the temperature + dynatempRange + Set the dynamic temperature range + dynatempExponent + Set the dynamic temperature exponent + repeatLastN + Set the last n tokens to consider for penalties + repeatPenalty + Set the penalty of repeated sequences of tokens + frequencyPenalty + Set the repetition alpha frequency penalty + presencePenalty + Set the repetition alpha presence penalty + miroStat + Set MiroStat sampling strategies. + mirostatTau + Set the MiroStat target entropy, parameter tau + mirostatEta + Set the MiroStat learning rate, parameter eta + penalizeNl + Whether to penalize newline tokens + nKeep + Set the number of tokens to keep from the initial prompt + seed + Set the RNG seed + nProbs + Set the amount top tokens probabilities to output if greater than 0. + minKeep + Set the amount of tokens the samplers should return at least (0 = disabled) + grammar + Set BNF-like grammar to constrain generations + penaltyPrompt + Override which part of the prompt is penalized for repetition. + ignoreEos + Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf) + disableTokenIds + Set the token ids to disable in the completion + stopStrings + Set strings upon seeing which token generation is stopped + samplers + Set which samplers to use for token generation in the given order + useChatTemplate + Set whether or not generate should apply a chat template + + Notes + ----- + To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + the number of GPU layers with the `setNGpuLayers` method. 
+ + When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + according to your hardware to avoid out-of-memory errors. + + Examples + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> from pyspark.sql.functions import lit + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("caption") \\ + ... .setOutputCol("caption_document") + >>> imageAssembler = ImageAssembler() \\ + ... .setInputCol("image") \\ + ... .setOutputCol("image_assembler") + >>> imagesPath = "src/test/resources/image/" + >>> data = ImageAssembler \\ + ... .loadImagesAsBytes(spark, imagesPath) \\ + ... .withColumn("caption", lit("Caption this image.")) # Add a caption to each image. + >>> nPredict = 40 + >>> model = AutoGGUFVisionModel.pretrained() \\ + ... .setInputCols(["caption_document", "image_assembler"]) \\ + ... .setOutputCol("completions") \\ + ... .setBatchSize(4) \\ + ... .setNGpuLayers(99) \\ + ... .setNCtx(4096) \\ + ... .setMinKeep(0) \\ + ... .setMinP(0.05) \\ + ... .setNPredict(nPredict) \\ + ... .setNProbs(0) \\ + ... .setPenalizeNl(False) \\ + ... .setRepeatLastN(256) \\ + ... .setRepeatPenalty(1.18) \\ + ... .setStopStrings(["</s>", "Llama:", "User:"]) \\ + ... .setTemperature(0.05) \\ + ... .setTfsZ(1) \\ + ... .setTypicalP(1) \\ + ... .setTopK(40) \\ + ... .setTopP(0.95) + >>> pipeline = Pipeline().setStages([documentAssembler, imageAssembler, model]) + >>> pipeline.fit(data).transform(data) \\ + ... .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "completions.result") \\ + ...
.show(truncate = False) + +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |image_name |result | + +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |palace.JPEG |[ The image depicts a large, ornate room with high ceilings and beautifully decorated walls. There are several chairs placed throughout the space, some of which have cushions] | + |egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the scene and appears to be sleeping while holding] | + |hippopotamus.JPEG|[ A large brown hippo is swimming in a body of water, possibly an aquarium. The hippo appears to be enjoying its time in the water and seems relaxed as it floats] | + |hen.JPEG |[ The image features a large chicken standing next to several baby chickens. In total, there are five birds in the scene: one adult and four young ones. They appear to be gathered together] | + |ostrich.JPEG |[ The image features a large, long-necked bird standing in the grass. It appears to be an ostrich or similar species with its head held high and looking around. In addition to] | + |junco.JPEG |[ A small bird with a black head and white chest is standing on the snow. It appears to be looking at something, possibly food or another animal in its vicinity. The scene takes place out] | + |bluetick.jpg |[ A dog with a red collar is sitting on the floor, looking at something. The dog appears to be staring into the distance or focusing its attention on an object in front of it.] | + |chihuahua.jpg |[ A small brown dog wearing a sweater is sitting on the floor. 
The dog appears to be looking at something, possibly its owner or another animal in the room. It seems comfortable and relaxed]| + |tractor.JPEG |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels and tires. The tractor appears to be parked on top of an empty field with] | + |ox.JPEG |[ A large bull with horns is standing in a grassy field.] | + +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------- + """ + + name = "AutoGGUFVisionModel" + inputAnnotatorTypes = [AnnotatorType.IMAGE, AnnotatorType.DOCUMENT] + outputAnnotatorType = AnnotatorType.DOCUMENT + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel", java_model=None): + super(AutoGGUFVisionModel, self).__init__( + classname=classname, + java_model=java_model + ) + + self._setDefault( + useChatTemplate=True, + nCtx=4096, + nBatch=512, + embedding=False, + nPredict=100 + ) + + @staticmethod + def loadSavedModel(modelPath, mmprojPath, spark_session): + """Loads a locally saved modelPath. + + Parameters + ---------- + modelPath : str + Path to the modelPath file + mmprojPath : str + Path to the mmprojPath file + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + AutoGGUFVisionModel + The restored modelPath + """ + from sparknlp.internal import _AutoGGUFVisionLoader + jModel = _AutoGGUFVisionLoader(modelPath, mmprojPath, spark_session._jsparkSession)._java_obj + return AutoGGUFVisionModel(java_model=jModel) + + @staticmethod + def pretrained(name="llava_v1.5_7b_Q4_0_gguf", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. 
+ + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "llava_v1.5_7b_Q4_0_gguf" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + AutoGGUFVisionModel + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(AutoGGUFVisionModel, name, lang, remote_loc) diff --git a/python/sparknlp/base/image_assembler.py b/python/sparknlp/base/image_assembler.py index cc8a9eb8c91253..61d4a283cdbb60 100644 --- a/python/sparknlp/base/image_assembler.py +++ b/python/sparknlp/base/image_assembler.py @@ -15,6 +15,8 @@ from pyspark import keyword_only from pyspark.ml.param import TypeConverters, Params, Param +from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.functions import regexp_replace, col from sparknlp.common import AnnotatorType from sparknlp.internal import AnnotatorTransformer @@ -112,3 +114,59 @@ def setTextCol(self, value): Name of an optional input text column """ return self._set(inputCol=value) + + @classmethod + def loadImagesAsBytes(cls, spark: SparkSession, path: str): + """ + Loads images from a given path and returns them as raw bytes, instead of the default + OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP. + + Multimodal inference with llama.cpp requires raw bytes as input. + + Parameters + ---------- + spark : SparkSession + The active SparkSession. + path : str + The path to the images. Supported image types are JPEG, PNG, GIF, and BMP. + + Returns + ------- + DataFrame + A DataFrame containing the images as raw bytes along with their metadata. 
+ """ + + # Replace the path separator in the `origin` field and `path` column, so that they match + def replace_path(column_name: str): + return regexp_replace(col(column_name), ":///", ":/") + + # Load the images as metadata with the default Spark image format + data = ( + spark.read.format("image") + .option("dropInvalid", True) + .load(path) + .withColumn( + "image", col("image").withField("origin", replace_path("image.origin")) + ) + ) + + # Load the images as raw binary files + image_bytes = ( + spark.read.format("binaryFile") + .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}") + .option("dropInvalid", True) + .load(path) + .withColumn("path", replace_path("path")) + ) + + # Join the two datasets on the file path + df_joined = data.join( + image_bytes, data["image.origin"] == image_bytes["path"], "inner" + ) + + # Replace the `data` field of the `image` column with raw bytes + df_image_replaced = df_joined.withColumn( + "image", df_joined["image"].withField("data", df_joined["content"]) + ) + + return df_image_replaced diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py index f5d7e55cfbdccb..7930fe1228d2f5 100644 --- a/python/sparknlp/common/properties.py +++ b/python/sparknlp/common/properties.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains classes for Annotator properties.""" +from typing import List, Dict from pyspark.ml.param import Param, Params, TypeConverters @@ -601,133 +602,637 @@ class HasGeneratorProperties: typeConverter=TypeConverters.toInt) -def setTask(self, value): - """Sets the transformer's task, e.g. ``summarize:``. + def setTask(self, value): + """Sets the transformer's task, e.g. ``summarize:``. 
- Parameters - ---------- - value : str - The transformer's task - """ - return self._set(task=value) + Parameters + ---------- + value : str + The transformer's task + """ + return self._set(task=value) + + + def setMinOutputLength(self, value): + """Sets minimum length of the sequence to be generated. + + Parameters + ---------- + value : int + Minimum length of the sequence to be generated + """ + return self._set(minOutputLength=value) + + + def setMaxOutputLength(self, value): + """Sets maximum length of output text. + + Parameters + ---------- + value : int + Maximum length of output text + """ + return self._set(maxOutputLength=value) + + + def setDoSample(self, value): + """Sets whether or not to use sampling, use greedy decoding otherwise. + + Parameters + ---------- + value : bool + Whether or not to use sampling; use greedy decoding otherwise + """ + return self._set(doSample=value) + + + def setTemperature(self, value): + """Sets the value used to module the next token probabilities. + + Parameters + ---------- + value : float + The value used to module the next token probabilities + """ + return self._set(temperature=value) + + + def setTopK(self, value): + """Sets the number of highest probability vocabulary tokens to keep for + top-k-filtering. + + Parameters + ---------- + value : int + Number of highest probability vocabulary tokens to keep + """ + return self._set(topK=value) -def setMinOutputLength(self, value): - """Sets minimum length of the sequence to be generated. + def setTopP(self, value): + """Sets the top cumulative probability for vocabulary tokens. - Parameters - ---------- - value : int - Minimum length of the sequence to be generated - """ - return self._set(minOutputLength=value) + If set to float < 1, only the most probable tokens with probabilities + that add up to ``topP`` or higher are kept for generation. 
+ Parameters + ---------- + value : float + Cumulative probability for vocabulary tokens + """ + return self._set(topP=value) -def setMaxOutputLength(self, value): - """Sets maximum length of output text. - Parameters - ---------- - value : int - Maximum length of output text - """ - return self._set(maxOutputLength=value) + def setRepetitionPenalty(self, value): + """Sets the parameter for repetition penalty. 1.0 means no penalty. + Parameters + ---------- + value : float + The repetition penalty -def setDoSample(self, value): - """Sets whether or not to use sampling, use greedy decoding otherwise. + References + ---------- + See `Ctrl: A Conditional Transformer Language Model For Controllable + Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details. + """ + return self._set(repetitionPenalty=value) - Parameters - ---------- - value : bool - Whether or not to use sampling; use greedy decoding otherwise - """ - return self._set(doSample=value) + def setNoRepeatNgramSize(self, value): + """Sets size of n-grams that can only occur once. -def setTemperature(self, value): - """Sets the value used to module the next token probabilities. + If set to int > 0, all ngrams of that size can only occur once. - Parameters - ---------- - value : float - The value used to module the next token probabilities - """ - return self._set(temperature=value) + Parameters + ---------- + value : int + N-gram size can only occur once + """ + return self._set(noRepeatNgramSize=value) -def setTopK(self, value): - """Sets the number of highest probability vocabulary tokens to keep for - top-k-filtering. + def setBeamSize(self, value): + """Sets the number of beam size for beam search. 
- Parameters - ---------- - value : int - Number of highest probability vocabulary tokens to keep - """ - return self._set(topK=value) + Parameters + ---------- + value : int + Number of beam size for beam search + """ + return self._set(beamSize=value) -def setTopP(self, value): - """Sets the top cumulative probability for vocabulary tokens. + def setNReturnSequences(self, value): + """Sets the number of sequences to return from the beam search. - If set to float < 1, only the most probable tokens with probabilities - that add up to ``topP`` or higher are kept for generation. + Parameters + ---------- + value : int + Number of sequences to return + """ + return self._set(nReturnSequences=value) - Parameters - ---------- - value : float - Cumulative probability for vocabulary tokens - """ - return self._set(topP=value) +class HasLlamaCppProperties: + # -------- MODEL PARAMETERS -------- + nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", + typeConverter=TypeConverters.toInt) + nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", + typeConverter=TypeConverters.toInt) + nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) + nBatch = Param(Params._dummy(), "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nUbatch = Param(Params._dummy(), "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nDraft = 
Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", + typeConverter=TypeConverters.toInt) + nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", + typeConverter=TypeConverters.toInt) + nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", + typeConverter=TypeConverters.toInt) + pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", + typeConverter=TypeConverters.toFloat) + nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", + typeConverter=TypeConverters.toInt) + nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)", + typeConverter=TypeConverters.toInt) + # Set how to split the model across GPUs + # + # - NONE: No GPU split + # - LAYER: Split the model across GPUs by layer + # - ROW: Split the model across GPUs by rows + gpuSplitMode = Param(Params._dummy(), "gpuSplitMode", "Set how to split the model across GPUs", + typeConverter=TypeConverters.toString) + mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", + typeConverter=TypeConverters.toInt) + tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", + typeConverter=TypeConverters.toListFloat) + grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) + grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) + ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", + typeConverter=TypeConverters.toFloat) + ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor 
of 1/N", + typeConverter=TypeConverters.toFloat) + yarnExtFactor = Param(Params._dummy(), "yarnExtFactor", "Set the YaRN extrapolation mix factor", + typeConverter=TypeConverters.toFloat) + yarnAttnFactor = Param(Params._dummy(), "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude", + typeConverter=TypeConverters.toFloat) + yarnBetaFast = Param(Params._dummy(), "yarnBetaFast", "Set the YaRN low correction dim or beta", + typeConverter=TypeConverters.toFloat) + yarnBetaSlow = Param(Params._dummy(), "yarnBetaSlow", "Set the YaRN high correction dim or alpha", + typeConverter=TypeConverters.toFloat) + yarnOrigCtx = Param(Params._dummy(), "yarnOrigCtx", "Set the YaRN original context size of model", + typeConverter=TypeConverters.toInt) + defragmentationThreshold = Param(Params._dummy(), "defragmentationThreshold", + "Set the KV cache defragmentation threshold", typeConverter=TypeConverters.toFloat) + # Set optimization strategies that help on some NUMA systems (if available) + # + # Available Strategies: + # + # - DISABLED: No NUMA optimizations + # - DISTRIBUTE: Spread execution evenly over all + # - ISOLATE: Only spawn threads on CPUs on the node that execution started on + # - NUMA_CTL: Use the CPU map provided by numactl + # - MIRROR: Mirrors the model across NUMA nodes + numaStrategy = Param(Params._dummy(), "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)", + typeConverter=TypeConverters.toString) + # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ # + # - UNSPECIFIED: Don't use any scaling + # - LINEAR: Linear scaling + # - YARN: YaRN RoPE scaling + ropeScalingType = Param(Params._dummy(), "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", + typeConverter=TypeConverters.toString) + # Set the pooling type for embeddings, use model default if unspecified + # + # - 0 NONE: Don't use any pooling + # - 1 MEAN: Mean Pooling + # - 2 CLS: CLS Pooling + poolingType = Param(Params._dummy(), "poolingType", + "Set the pooling type for embeddings, use model default if unspecified", + typeConverter=TypeConverters.toString) + modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", + typeConverter=TypeConverters.toString) + modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) + lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)", + typeConverter=TypeConverters.toString) + lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", + typeConverter=TypeConverters.toString) + # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", + typeConverter=TypeConverters.toBoolean) + flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", + typeConverter=TypeConverters.toBoolean) + inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", + typeConverter=TypeConverters.toBoolean) + useMmap = Param(Params._dummy(), "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using 
mlock)", + typeConverter=TypeConverters.toBoolean) + useMlock = Param(Params._dummy(), "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing", + typeConverter=TypeConverters.toBoolean) + noKvOffload = Param(Params._dummy(), "noKvOffload", "Whether to disable KV offload", + typeConverter=TypeConverters.toBoolean) + systemPrompt = Param(Params._dummy(), "systemPrompt", "Set a system prompt to use", + typeConverter=TypeConverters.toString) + chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use", + typeConverter=TypeConverters.toString) + + # -------- INFERENCE PARAMETERS -------- + inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with", + typeConverter=TypeConverters.toString) + inputSuffix = Param(Params._dummy(), "inputSuffix", "Set a suffix for infilling", + typeConverter=TypeConverters.toString) + cachePrompt = Param(Params._dummy(), "cachePrompt", "Whether to remember the prompt to avoid reprocessing it", + typeConverter=TypeConverters.toBoolean) + nPredict = Param(Params._dummy(), "nPredict", "Set the number of tokens to predict", + typeConverter=TypeConverters.toInt) + topK = Param(Params._dummy(), "topK", "Set top-k sampling", typeConverter=TypeConverters.toInt) + topP = Param(Params._dummy(), "topP", "Set top-p sampling", typeConverter=TypeConverters.toFloat) + minP = Param(Params._dummy(), "minP", "Set min-p sampling", typeConverter=TypeConverters.toFloat) + tfsZ = Param(Params._dummy(), "tfsZ", "Set tail free sampling, parameter z", typeConverter=TypeConverters.toFloat) + typicalP = Param(Params._dummy(), "typicalP", "Set locally typical sampling, parameter p", + typeConverter=TypeConverters.toFloat) + temperature = Param(Params._dummy(), "temperature", "Set the temperature", typeConverter=TypeConverters.toFloat) + dynamicTemperatureRange = Param(Params._dummy(), "dynatempRange", "Set the dynamic temperature range", + 
typeConverter=TypeConverters.toFloat) + dynamicTemperatureExponent = Param(Params._dummy(), "dynatempExponent", "Set the dynamic temperature exponent", + typeConverter=TypeConverters.toFloat) + repeatLastN = Param(Params._dummy(), "repeatLastN", "Set the last n tokens to consider for penalties", + typeConverter=TypeConverters.toInt) + repeatPenalty = Param(Params._dummy(), "repeatPenalty", "Set the penalty of repeated sequences of tokens", + typeConverter=TypeConverters.toFloat) + frequencyPenalty = Param(Params._dummy(), "frequencyPenalty", "Set the repetition alpha frequency penalty", + typeConverter=TypeConverters.toFloat) + presencePenalty = Param(Params._dummy(), "presencePenalty", "Set the repetition alpha presence penalty", + typeConverter=TypeConverters.toFloat) + miroStat = Param(Params._dummy(), "miroStat", "Set MiroStat sampling strategies.", + typeConverter=TypeConverters.toString) + miroStatTau = Param(Params._dummy(), "mirostatTau", "Set the MiroStat target entropy, parameter tau", + typeConverter=TypeConverters.toFloat) + miroStatEta = Param(Params._dummy(), "mirostatEta", "Set the MiroStat learning rate, parameter eta", + typeConverter=TypeConverters.toFloat) + penalizeNl = Param(Params._dummy(), "penalizeNl", "Whether to penalize newline tokens", + typeConverter=TypeConverters.toBoolean) + nKeep = Param(Params._dummy(), "nKeep", "Set the number of tokens to keep from the initial prompt", + typeConverter=TypeConverters.toInt) + seed = Param(Params._dummy(), "seed", "Set the RNG seed", typeConverter=TypeConverters.toInt) + nProbs = Param(Params._dummy(), "nProbs", "Set the amount top tokens probabilities to output if greater than 0.", + typeConverter=TypeConverters.toInt) + minKeep = Param(Params._dummy(), "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)", + typeConverter=TypeConverters.toInt) + grammar = Param(Params._dummy(), "grammar", "Set BNF-like grammar to constrain generations", + 
typeConverter=TypeConverters.toString) + penaltyPrompt = Param(Params._dummy(), "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.", + typeConverter=TypeConverters.toString) + ignoreEos = Param(Params._dummy(), "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)", + typeConverter=TypeConverters.toBoolean) + disableTokenIds = Param(Params._dummy(), "disableTokenIds", "Set the token ids to disable in the completion", + typeConverter=TypeConverters.toListInt) + stopStrings = Param(Params._dummy(), "stopStrings", "Set strings upon seeing which token generation is stopped", + typeConverter=TypeConverters.toListString) + samplers = Param(Params._dummy(), "samplers", "Set which samplers to use for token generation in the given order", + typeConverter=TypeConverters.toListString) + useChatTemplate = Param(Params._dummy(), "useChatTemplate", + "Set whether or not generate should apply a chat template", + typeConverter=TypeConverters.toBoolean) + + # -------- MODEL SETTERS -------- + def setNThreads(self, nThreads: int): + """Set the number of threads to use during generation""" + return self._set(nThreads=nThreads) + + def setNThreadsDraft(self, nThreadsDraft: int): + """Set the number of threads to use during draft generation""" + return self._set(nThreadsDraft=nThreadsDraft) + + def setNThreadsBatch(self, nThreadsBatch: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatch=nThreadsBatch) + + def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatchDraft=nThreadsBatchDraft) + + def setNCtx(self, nCtx: int): + """Set the size of the prompt context""" + return self._set(nCtx=nCtx) + + def setNBatch(self, nBatch: int): + """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" + return 
self._set(nBatch=nBatch) + + def setNUbatch(self, nUbatch: int): + """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nUbatch=nUbatch) + + def setNDraft(self, nDraft: int): + """Set the number of tokens to draft for speculative decoding""" + return self._set(nDraft=nDraft) + + def setNChunks(self, nChunks: int): + """Set the maximal number of chunks to process""" + return self._set(nChunks=nChunks) + + def setNSequences(self, nSequences: int): + """Set the number of sequences to decode""" + return self._set(nSequences=nSequences) + + def setPSplit(self, pSplit: float): + """Set the speculative decoding split probability""" + return self._set(pSplit=pSplit) + + def setNGpuLayers(self, nGpuLayers: int): + """Set the number of layers to store in VRAM (-1 - use default)""" + return self._set(nGpuLayers=nGpuLayers) + + def setNGpuLayersDraft(self, nGpuLayersDraft: int): + """Set the number of layers to store in VRAM for the draft model (-1 - use default)""" + return self._set(nGpuLayersDraft=nGpuLayersDraft) + + def setGpuSplitMode(self, gpuSplitMode: str): + """Set how to split the model across GPUs""" + return self._set(gpuSplitMode=gpuSplitMode) + + def setMainGpu(self, mainGpu: int): + """Set the main GPU that is used for scratch and small tensors.""" + return self._set(mainGpu=mainGpu) + + def setTensorSplit(self, tensorSplit: List[float]): + """Set how split tensors should be distributed across GPUs""" + return self._set(tensorSplit=tensorSplit) + + def setGrpAttnN(self, grpAttnN: int): + """Set the group-attention factor""" + return self._set(grpAttnN=grpAttnN) + + def setGrpAttnW(self, grpAttnW: int): + """Set the group-attention width""" + return self._set(grpAttnW=grpAttnW) + + def setRopeFreqBase(self, ropeFreqBase: float): + """Set the RoPE base frequency, used by NTK-aware scaling""" + return self._set(ropeFreqBase=ropeFreqBase) + + def setRopeFreqScale(self, ropeFreqScale: float): + """Set the RoPE 
frequency scaling factor, expands context by a factor of 1/N""" + return self._set(ropeFreqScale=ropeFreqScale) + + def setYarnExtFactor(self, yarnExtFactor: float): + """Set the YaRN extrapolation mix factor""" + return self._set(yarnExtFactor=yarnExtFactor) + + def setYarnAttnFactor(self, yarnAttnFactor: float): + """Set the YaRN scale sqrt(t) or attention magnitude""" + return self._set(yarnAttnFactor=yarnAttnFactor) + + def setYarnBetaFast(self, yarnBetaFast: float): + """Set the YaRN low correction dim or beta""" + return self._set(yarnBetaFast=yarnBetaFast) + + def setYarnBetaSlow(self, yarnBetaSlow: float): + """Set the YaRN high correction dim or alpha""" + return self._set(yarnBetaSlow=yarnBetaSlow) + + def setYarnOrigCtx(self, yarnOrigCtx: int): + """Set the YaRN original context size of model""" + return self._set(yarnOrigCtx=yarnOrigCtx) + + def setDefragmentationThreshold(self, defragmentationThreshold: float): + """Set the KV cache defragmentation threshold""" + return self._set(defragmentationThreshold=defragmentationThreshold) -def setRepetitionPenalty(self, value): - """Sets the parameter for repetition penalty. 1.0 means no penalty. + def setNumaStrategy(self, numaStrategy: str): + """Set optimization strategies that help on some NUMA systems (if available)""" + numaUpper = numaStrategy.upper() + numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] + if numaUpper not in numaStrategies: + raise ValueError( + f"Invalid NUMA strategy: {numaUpper}. 
" + + f"Valid values are: {numaStrategies}" + ) + return self._set(numaStrategy=numaStrategy) + + def setRopeScalingType(self, ropeScalingType: str): + """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" + return self._set(ropeScalingType=ropeScalingType) + + def setPoolingType(self, poolingType: str): + """Set the pooling type for embeddings, use model default if unspecified""" + poolingTypeUpper = poolingType.upper() + poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + if poolingTypeUpper not in poolingTypes: + raise ValueError( + f"Invalid pooling type: {poolingType}. " + + f"Valid values are: {poolingTypes}" + ) + return self._set(poolingType=poolingType) + + def setModelDraft(self, modelDraft: str): + """Set the draft model for speculative decoding""" + return self._set(modelDraft=modelDraft) + + def setModelAlias(self, modelAlias: str): + """Set a model alias""" + return self._set(modelAlias=modelAlias) + + def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str): + """Set path to static lookup cache to use for lookup decoding (not updated by generation)""" + return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath) + + def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str): + """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)""" + return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath) + + def setEmbedding(self, embedding: bool): + """Whether to load model with embedding support""" + return self._set(embedding=embedding) + + def setFlashAttention(self, flashAttention: bool): + """Whether to enable Flash Attention""" + return self._set(flashAttention=flashAttention) + + def setInputPrefixBos(self, inputPrefixBos: bool): + """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string""" + return self._set(inputPrefixBos=inputPrefixBos) + + def setUseMmap(self, useMmap: bool): + """Whether to use memory-map 
model (faster load but may increase pageouts if not using mlock)""" + return self._set(useMmap=useMmap) + + def setUseMlock(self, useMlock: bool): + """Whether to force the system to keep model in RAM rather than swapping or compressing""" + return self._set(useMlock=useMlock) + + def setNoKvOffload(self, noKvOffload: bool): + """Whether to disable KV offload""" + return self._set(noKvOffload=noKvOffload) + + def setSystemPrompt(self, systemPrompt: str): + """Set a system prompt to use""" + return self._set(systemPrompt=systemPrompt) + + def setChatTemplate(self, chatTemplate: str): + """The chat template to use""" + return self._set(chatTemplate=chatTemplate) + + # -------- INFERENCE SETTERS -------- + def setInputPrefix(self, inputPrefix: str): + """Set the prompt to start generation with""" + return self._set(inputPrefix=inputPrefix) - Parameters - ---------- - value : float - The repetition penalty + def setInputSuffix(self, inputSuffix: str): + """Set a suffix for infilling""" + return self._set(inputSuffix=inputSuffix) - References - ---------- - See `Ctrl: A Conditional Transformer Language Model For Controllable - Generation `__ for more details. - """ - return self._set(repetitionPenalty=value) + def setCachePrompt(self, cachePrompt: bool): + """Whether to remember the prompt to avoid reprocessing it""" + return self._set(cachePrompt=cachePrompt) + def setNPredict(self, nPredict: int): + """Set the number of tokens to predict""" + return self._set(nPredict=nPredict) -def setNoRepeatNgramSize(self, value): - """Sets size of n-grams that can only occur once. + def setTopK(self, topK: int): + """Set top-k sampling""" + return self._set(topK=topK) - If set to int > 0, all ngrams of that size can only occur once. 
+ def setTopP(self, topP: float): + """Set top-p sampling""" + return self._set(topP=topP) - Parameters - ---------- - value : int - N-gram size can only occur once - """ - return self._set(noRepeatNgramSize=value) + def setMinP(self, minP: float): + """Set min-p sampling""" + return self._set(minP=minP) + + def setTfsZ(self, tfsZ: float): + """Set tail free sampling, parameter z""" + return self._set(tfsZ=tfsZ) + + def setTypicalP(self, typicalP: float): + """Set locally typical sampling, parameter p""" + return self._set(typicalP=typicalP) + + def setTemperature(self, temperature: float): + """Set the temperature""" + return self._set(temperature=temperature) + + def setDynamicTemperatureRange(self, dynamicTemperatureRange: float): + """Set the dynamic temperature range""" + return self._set(dynamicTemperatureRange=dynamicTemperatureRange) + + def setDynamicTemperatureExponent(self, dynamicTemperatureExponent: float): + """Set the dynamic temperature exponent""" + return self._set(dynamicTemperatureExponent=dynamicTemperatureExponent) + + def setRepeatLastN(self, repeatLastN: int): + """Set the last n tokens to consider for penalties""" + return self._set(repeatLastN=repeatLastN) + + def setRepeatPenalty(self, repeatPenalty: float): + """Set the penalty of repeated sequences of tokens""" + return self._set(repeatPenalty=repeatPenalty) + + def setFrequencyPenalty(self, frequencyPenalty: float): + """Set the repetition alpha frequency penalty""" + return self._set(frequencyPenalty=frequencyPenalty) + + def setPresencePenalty(self, presencePenalty: float): + """Set the repetition alpha presence penalty""" + return self._set(presencePenalty=presencePenalty) + + def setMiroStat(self, miroStat: str): + """Set MiroStat sampling strategies.""" + return self._set(miroStat=miroStat) + + def setMiroStatTau(self, miroStatTau: float): + """Set the MiroStat target entropy, parameter tau""" + return self._set(miroStatTau=miroStatTau) + + def setMiroStatEta(self, miroStatEta: 
float): + """Set the MiroStat learning rate, parameter eta""" + return self._set(miroStatEta=miroStatEta) + + def setPenalizeNl(self, penalizeNl: bool): + """Whether to penalize newline tokens""" + return self._set(penalizeNl=penalizeNl) + + def setNKeep(self, nKeep: int): + """Set the number of tokens to keep from the initial prompt""" + return self._set(nKeep=nKeep) + + def setSeed(self, seed: int): + """Set the RNG seed""" + return self._set(seed=seed) + + def setNProbs(self, nProbs: int): + """Set the number of top token probabilities to output if greater than 0.""" + return self._set(nProbs=nProbs) + + def setMinKeep(self, minKeep: int): + """Set the amount of tokens the samplers should return at least (0 = disabled)""" + return self._set(minKeep=minKeep) + + def setGrammar(self, grammar: str): + """Set BNF-like grammar to constrain generations""" + return self._set(grammar=grammar) + + def setPenaltyPrompt(self, penaltyPrompt: str): + """Override which part of the prompt is penalized for repetition.""" + return self._set(penaltyPrompt=penaltyPrompt) + def setIgnoreEos(self, ignoreEos: bool): + """Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)""" + return self._set(ignoreEos=ignoreEos) -def setBeamSize(self, value): - """Sets the number of beam size for beam search. 
+ def setDisableTokenIds(self, disableTokenIds: List[int]): + """Set the token ids to disable in the completion""" + return self._set(disableTokenIds=disableTokenIds) - Parameters - ---------- - value : int - Number of beam size for beam search - """ - return self._set(beamSize=value) + def setStopStrings(self, stopStrings: List[str]): + """Set strings upon seeing which token generation is stopped""" + return self._set(stopStrings=stopStrings) + def setSamplers(self, samplers: List[str]): + """Set which samplers to use for token generation in the given order""" + return self._set(samplers=samplers) -def setNReturnSequences(self, value): - """Sets the number of sequences to return from the beam search. + def setUseChatTemplate(self, useChatTemplate: bool): + """Set whether generate should apply a chat template""" + return self._set(useChatTemplate=useChatTemplate) - Parameters - ---------- - value : int - Number of sequences to return - """ - return self._set(nReturnSequences=value) + # -------- JAVA SETTERS -------- + def setTokenIdBias(self, tokenIdBias: Dict[int, float]): + """Set token id bias""" + return self._call_java("setTokenIdBias", tokenIdBias) + + def setTokenBias(self, tokenBias: Dict[str, float]): + """Set token bias""" + return self._call_java("setTokenBias", tokenBias) + + def setLoraAdapters(self, loraAdapters: Dict[str, float]): + """Set the LoRA adapters""" + return self._call_java("setLoraAdapters", loraAdapters) + + def getMetadata(self): + """Gets the metadata of the model""" + return self._call_java("getMetadata") diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 4cb5321e8a8691..44504cb749151b 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -992,7 +992,7 @@ class _AutoGGUFLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_AutoGGUFLoader, self).__init__( "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, 
jspark) - + class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): @@ -1021,3 +1021,9 @@ def __init__(self, path, jspark): path, jspark, ) + + +class _AutoGGUFVisionLoader(ExtendedJavaWrapper): + def __init__(self, modelPath, mmprojPath, jspark): + super(_AutoGGUFVisionLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel.loadSavedModel", modelPath, mmprojPath, jspark) diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py index e6553bc509e5ff..76ad82c21d0814 100644 --- a/python/test/annotator/seq2seq/auto_gguf_model_test.py +++ b/python/test/annotator/seq2seq/auto_gguf_model_test.py @@ -102,7 +102,6 @@ def runTest(self): model.setGpuSplitMode("NONE") model.setMainGpu(0) model.setTensorSplit([]) - model.setNBeams(0) model.setGrpAttnN(1) model.setGrpAttnW(512) model.setRopeFreqBase(1.0) @@ -115,11 +114,10 @@ def runTest(self): model.setDefragmentationThreshold(-1.0) model.setNumaStrategy("DISTRIBUTE") model.setRopeScalingType("UNSPECIFIED") - model.setPoolingType("UNSPECIFIED") + model.setPoolingType("NONE") model.setModelDraft("") model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") - model.setLoraBase("") model.setEmbedding(False) model.setFlashAttention(False) model.setInputPrefixBos(False) @@ -171,6 +169,7 @@ def runTest(self): pipeline = Pipeline().setStages([document_assembler, model]) results = pipeline.fit(data).transform(data) + # Can fail due to bogus parameters, but at least we are testing the setters results.select("completions").show(truncate=False) diff --git a/python/test/annotator/seq2seq/auto_gguf_vision_model_test.py b/python/test/annotator/seq2seq/auto_gguf_vision_model_test.py new file mode 100644 index 00000000000000..c0509a59841ba7 --- /dev/null +++ b/python/test/annotator/seq2seq/auto_gguf_vision_model_test.py @@ -0,0 +1,86 @@ +# 
Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest +from pyspark.sql.functions import lit + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AutoGGUFVisionModelTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + documentAssembler = ( + DocumentAssembler().setInputCol("caption").setOutputCol("caption_document") + ) + imageAssembler = ( + ImageAssembler().setInputCol("image").setOutputCol("image_assembler") + ) + imagesPath = "../src/test/resources/image/" + data = ImageAssembler.loadImagesAsBytes(self.spark, imagesPath).withColumn( + "caption", lit("Caption this image.") + ) # Add a caption to each image. 
+ nPredict = 40 + model = ( + AutoGGUFVisionModel.pretrained() + .setInputCols(["caption_document", "image_assembler"]) + .setOutputCol("completions") + .setChatTemplate("vicuna") + .setBatchSize(4) + .setNGpuLayers(99) + .setNCtx(4096) + .setMinKeep(0) + .setMinP(0.05) + .setNPredict(nPredict) + .setNProbs(0) + .setPenalizeNl(False) + .setRepeatLastN(256) + .setRepeatPenalty(1.18) + .setStopStrings(["", "Llama:", "User:"]) + .setTemperature(0.05) + .setTfsZ(1) + .setTypicalP(1) + .setTopK(40) + .setTopP(0.95) + ) + pipeline = Pipeline().setStages([documentAssembler, imageAssembler, model]) + # pipeline.fit(data).transform(data).selectExpr( + # "reverse(split(image.origin, '/'))[0] as image_name", "completions.result" + # ).show(truncate=False) + + results = pipeline.fit(data).transform(data).collect() + + expectedWords = { + "bluetick.jpg": "dog", + "chihuahua.jpg": "dog", + "egyptian_cat.jpeg": "cat", + "hen.JPEG": "chick", + "hippopotamus.JPEG": "hippo", + "junco.JPEG": "bird", + "ostrich.JPEG": "ostrich", + "ox.JPEG": "bull", + "palace.JPEG": "room", + "tractor.JPEG": "tractor", + } + + for result in results: + image_name = result["image_assembler"][0]["origin"].split("/")[-1] + completion = result["completions"][0]["result"] + assert expectedWords[image_name] in completion, f"Expected '{expectedWords[image_name]}' in '{completion}'" diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala index ef7091c3b5cd12..6f68ead3a51ef0 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -77,6 +77,7 @@ object GGUFWrapper { new LlamaModel(modelParameters) } + /** Reads the GGUF model from file during loadSavedModel. 
*/ def read(sparkSession: SparkSession, modelPath: String): GGUFWrapper = { // TODO Better Sanity Check val modelFile = new File(modelPath) @@ -92,6 +93,9 @@ object GGUFWrapper { new GGUFWrapper(modelFile.getName, modelFile.getParent) } + /** Reads the GGUF model from the folder passed by the Spark Reader during loading of a + * serialized model. + */ def readModel(modelFolderPath: String, spark: SparkSession): GGUFWrapper = { def findGGUFModelInFolder(folderPath: String): String = { val folder = new File(folderPath) diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala new file mode 100644 index 00000000000000..89eb8f517360f2 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala @@ -0,0 +1,149 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.ml.gguf + +import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkFiles +import org.apache.spark.sql.SparkSession + +import java.io.File +import java.nio.file.{Files, Paths} + +class GGUFWrapperMultiModal(var modelFileName: String, var mmprojFileName: String) + extends Serializable { + + /** For Deserialization */ + def this() = { + this(null, null) + } + + // Important for serialization on non-Kryo serializers + @transient private var llamaModel: LlamaModel = _ + + def getSession(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + if (llamaModel == null) { + val modelFilePath = SparkFiles.get(modelFileName) + val mmprojFilePath = SparkFiles.get(mmprojFileName) + val filesExist = + Paths.get(modelFilePath).toFile.exists() && Paths.get(mmprojFilePath).toFile.exists() + + if (filesExist) { + modelParameters.setModelFilePath(modelFilePath) + modelParameters.setMMProj(mmprojFilePath) + llamaModel = GGUFWrapperMultiModal.withSafeGGUFModelLoader(modelParameters) + } else + throw new IllegalStateException( + s"Model file $modelFileName or mmproj file $mmprojFileName does not exist in SparkFiles.") + } + // TODO: if the model is already loaded then the model parameters will not apply. perhaps output a logline here. 
+ llamaModel + } + + def saveToFile(folder: String): Unit = { + val modelFilePath = SparkFiles.get(modelFileName) + val mmprojFilePath = SparkFiles.get(mmprojFileName) + val modelOutputPath = Paths.get(folder, modelFileName) + val mmprojOutputPath = Paths.get(folder, mmprojFileName) + Files.copy(Paths.get(modelFilePath), modelOutputPath) + Files.copy(Paths.get(mmprojFilePath), mmprojOutputPath) + } + + // Destructor to free the model when this object is garbage collected + override def finalize(): Unit = { + if (llamaModel != null) { + llamaModel.close() + } + } + +} + +/** Companion object */ +object GGUFWrapperMultiModal { + private def withSafeGGUFModelLoader(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + new LlamaModel(modelParameters) + } + + /** Reads the GGUF model from file during loadSavedModel. */ + def read( + sparkSession: SparkSession, + modelPath: String, + mmprojPath: String): GGUFWrapperMultiModal = { + val modelFile = new File(modelPath) + val mmprojFile = new File(mmprojPath) + + if (!modelFile.getName.endsWith(".gguf")) + throw new IllegalArgumentException(s"Model file $modelPath is not a GGUF model file") + + if (!mmprojFile.getName.endsWith(".gguf")) + throw new IllegalArgumentException(s"mmproj file $mmprojPath is not a GGUF model file") + + if (!mmprojFile.getName.contains("mmproj")) + throw new IllegalArgumentException( + s"mmproj file $mmprojPath is not a GGUF mmproj file (should contain 'mmproj' in its name)") + + if (modelFile.exists() && mmprojFile.exists()) { + sparkSession.sparkContext.addFile(modelPath) + sparkSession.sparkContext.addFile(mmprojPath) + } else + throw new IllegalArgumentException( + s"Model file $modelPath or mmproj file $mmprojPath does not exist") + + new GGUFWrapperMultiModal(modelFile.getName, mmprojFile.getName) + } + + /** Reads the GGUF model from the folder passed by the Spark Reader during loading of a + * serialized model. 
+ */ + def readModel(modelFolderPath: String, spark: SparkSession): GGUFWrapperMultiModal = { + def findGGUFModelsInFolder(folderPath: String): (String, String) = { + val folder = new File(folderPath) + if (folder.exists && folder.isDirectory) { + val ggufFiles: Array[String] = folder.listFiles + .filter(_.isFile) + .filter(_.getName.endsWith(".gguf")) + .map(_.getAbsolutePath) + + val (ggufMainPath, ggufMmprojPath) = + if (ggufFiles.length == 2 && ggufFiles.exists(_.contains("mmproj"))) { + val Array(firstModel, secondModel) = ggufFiles + if (firstModel.contains("mmproj")) (secondModel, firstModel) + else (firstModel, secondModel) + } else + throw new IllegalArgumentException( + s"Could not determine main GGUF model or mmproj GGUF model in $folderPath." + + s" The folder should contain exactly two files:" + + s" One main GGUF model and one mmproj GGUF model." + + s" The mmproj model should have 'mmproj' in its name.") + + (ggufMainPath, ggufMmprojPath) + } else { + throw new IllegalArgumentException(s"Path $folderPath is not a directory") + } + } + + val uri = new java.net.URI(modelFolderPath.replaceAllLiterally("\\", "/")) + // In case the path belongs to a different file system but doesn't have the scheme prepended (e.g. 
dbfs) + val fileSystem: FileSystem = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) + val actualFolderPath = fileSystem.resolvePath(new Path(modelFolderPath)).toString + val localFolder = ResourceHelper.copyToLocal(actualFolderPath) + val (ggufMainPath, ggufMmprojPath) = findGGUFModelsInFolder(localFolder) + read(spark, ggufMainPath, ggufMmprojPath) + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 1a350c750fc958..e1e75926a89ffa 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -111,6 +111,23 @@ abstract class AnnotatorModel[M <: Model[M]] extends RawAnnotator[M] with CanBeL }) .withColumn(getOutputCol, wrapColumnMetadata(col(getOutputCol))) dfWithMetadata + + case withBatchAnnotateTextImage: HasBatchedAnnotateTextImage[M] => + implicit val encoder: ExpressionEncoder[Row] = + SparkNlpConfig.getEncoder(inputDataset, newStructType) + val processedDataFrame = inputDataset.mapPartitions(partition => { + withBatchAnnotateTextImage.batchProcess(partition) + }) + + // TODO: Do we really need to repeat this in every case? 
+ /** Put back column metadata from `inputDataset` after destructive mapPartitions */ + val dfWithMetadata = inputDataset.schema.fields + .foldLeft(processedDataFrame)((dataFrame, field) => { + dataFrame + .withColumn(field.name, dataFrame.col(field.name).as(field.name, field.metadata)) + }) + .withColumn(getOutputCol, wrapColumnMetadata(col(getOutputCol))) + dfWithMetadata } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateTextImage.scala b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateTextImage.scala new file mode 100644 index 00000000000000..6881e74dd12510 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateTextImage.scala @@ -0,0 +1,98 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp + +import org.apache.spark.ml.Model +import org.apache.spark.ml.param.IntParam +import org.apache.spark.sql.Row + +trait HasBatchedAnnotateTextImage[M <: Model[M]] { + + this: RawAnnotator[M] => + + /** Size of every batch (Default depends on model). + * + * @group param + */ + val batchSize = new IntParam(this, "batchSize", "Size of every batch.") + + /** Size of every batch. + * + * @group setParam + */ + def setBatchSize(size: Int): this.type = { + val recommended = size + require(recommended > 0, "batchSize must be greater than 0") + set(this.batchSize, recommended) + } + + /** Size of every batch. 
+ * + * @group getParam + */ + def getBatchSize: Int = $(batchSize) + + private def getCaptionImageAnnotations(row: Row): (Annotation, AnnotationImage) = { + require( + getInputCols.length == 2, + "Only two input columns are allowed for this annotator:" + + " One for text caption and one for image.") + + // Assuming we only have one annotation per field + val inputAnnotations: Array[Row] = + getInputCols.map(row.fieldIndex).map(i => row.getAs[Seq[Row]](i).head) + + val (documentStruct: Row, imageStruct: Row) = + if (inputAnnotations.head.getString(0) == AnnotatorType.DOCUMENT) { + (inputAnnotations.head, inputAnnotations.last) + } else { + (inputAnnotations.last, inputAnnotations.head) + } + + val document = Annotation(documentStruct) + val image = AnnotationImage(imageStruct) + (document, image) + } + + def batchProcess(rows: Iterator[_]): Iterator[Row] = { + rows + .grouped(getBatchSize) + .flatMap { case batchedRows: Seq[Row] => + val inputAnnotations: Seq[(Annotation, AnnotationImage)] = + batchedRows.map(getCaptionImageAnnotations) + val outputAnnotations = batchAnnotate(inputAnnotations) + + batchedRows.zip(outputAnnotations).map { case (row, annotations) => + row.toSeq ++ Array(annotations.map(a => Row(a.productIterator.toSeq: _*))) + } + } + .map(Row.fromSeq) + } + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations in batches that correspond to inputAnnotationCols generated by previous + * annotators if any + * @return + * any number of annotations processed for every batch of input annotations. 
Not necessary + * one to one relationship + */ + def batchAnnotate(batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala index 73b08bae40d695..ae620dc78cbaa5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala @@ -22,9 +22,9 @@ import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.functions.{col, regexp_replace, udf} import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} /** Prepares images read by Spark into a format that is processable by Spark NLP. This component * is needed to process images. @@ -213,4 +213,49 @@ private[nlp] case class ImageFields( /** This is the companion object of [[ImageAssembler]]. Please refer to that class for the * documentation. */ -object ImageAssembler extends DefaultParamsReadable[ImageAssembler] +object ImageAssembler extends DefaultParamsReadable[ImageAssembler] { + + /** Helper function that loads images from a path and returns them as raw bytes, instead of the + * default OpenCV compatible format. + * + * Supported image types are JPEG, PNG, GIF, BMP (limited to images supported by stb_image.h). + * + * Multimodal inference with llama.cpp requires raw bytes as input. + * + * @param spark + * The SparkSession + * @param path + * The path to the images. Supported image types are JPEG, PNG, GIF, BMP. + * @return + * A dataframe with the images as raw bytes, as well as their metadata. 
+ */ + def loadImagesAsBytes(spark: SparkSession, path: String): DataFrame = { + // Replace the path separator in the `origin` field and `path` column, so that they match + def replacePath(columnName: String) = regexp_replace(col(columnName), ":///", ":/") + + val data: DataFrame = + spark.read + .format("image") + .option("dropInvalid", value = true) + .load(path) + .withColumn("image", col("image").withField("origin", replacePath("image.origin"))) + + val imageBytes: DataFrame = + spark.read + .format("binaryFile") + .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}") + .option("dropInvalid", value = true) + .load(path) + .withColumn("path", replacePath("path")) + + // Join on path + val dfJoined = + data.join(imageBytes, data("image.origin") === imageBytes("path"), "inner") + + // Replace image column data with image bytes + val dfImageReplaced = + dfJoined.withColumn("image", dfJoined("image").withField("data", dfJoined("content"))) + + dfImageReplaced + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index efbd3a288896c1..e88f5feaa9fb01 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -833,4 +833,8 @@ package object annotator { extends ReadablePretrainedAutoGGUFEmbeddings with ReadAutoGGUFEmbeddings + type AutoGGUFVisionModel = com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel + object AutoGGUFVisionModel + extends ReadablePretrainedAutoGGUFVisionModel + with ReadAutoGGUFVisionModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/util/io/ImageIOUtils.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/util/io/ImageIOUtils.scala index ca5be6ba37dfdb..5bdafeca1b29e3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/util/io/ImageIOUtils.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/util/io/ImageIOUtils.scala @@ -67,20 +67,18 @@ 
private[johnsnowlabs] object ImageIOUtils { def readImage(file: File): Option[BufferedImage] = { Try(ImageIO.read(file)) match { case Success(bufferedImage) => Some(bufferedImage) - case Failure(_) => { + case Failure(_) => logger.warn(s"Error in ImageIOUtils.readImage while reading file: ${file.getPath}") None - } } } def readImage(inputStream: InputStream): Option[BufferedImage] = { Try(ImageIO.read(inputStream)) match { case Success(bufferedImage) => Some(bufferedImage) - case Failure(_) => { + case Failure(_) => logger.warn(s"Error in ImageIOUtils.readImage while reading inputStream") None - } } } @@ -203,4 +201,7 @@ private[johnsnowlabs] object ImageIOUtils { } + def encodeImageBase64(image: Array[Byte]): String = + java.util.Base64.getEncoder.encodeToString(image) + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala new file mode 100644 index 00000000000000..f3de739613ece3 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala @@ -0,0 +1,326 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.ml.gguf.GGUFWrapperMultiModal +import com.johnsnowlabs.ml.util.LlamaCPP +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils +import com.johnsnowlabs.nlp.llama.{LlamaException, LlamaModel} +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** Multimodal annotator that uses the llama.cpp library to generate text completions with large + * language models. It supports ingesting images for captioning. + * + * At the moment only CLIP based models are supported. + * + * For settable parameters, and their explanations, see [[HasLlamaCppInferenceProperties]], + * [[HasLlamaCppModelProperties]] and refer to the llama.cpp documentation of + * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]] + * for more information. + * + * If the parameters are not set, the annotator will default to use the parameters provided by + * the model. + * + * This annotator expects a column of annotator type [[AnnotationImage]] for the image and + * [[Annotation]] for the caption. Note that the image bytes in the image annotation need to be + * raw image bytes without preprocessing. We provide the helper function + * [[ImageAssembler.loadImagesAsBytes]] to load the image bytes from a directory. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val autoGGUFVisionModel = AutoGGUFVisionModel.pretrained() + * .setInputCols("image", "document") + * .setOutputCol("completions") + * }}} + * The default model is `"llava_v1.5_7b_Q4_0_gguf"`, if no name is provided. + * + * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]]. 
+ * + * For extended examples of usage, see the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTest.scala AutoGGUFVisionModelTest]] + * and the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb example notebook]]. + * + * ==Note== + * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + * the number of GPU layers with the `setNGpuLayers` method. + * + * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + * according to your hardware to avoid out-of-memory errors. + * + * ==Example== + * + * {{{ + * import com.johnsnowlabs.nlp.ImageAssembler + * import com.johnsnowlabs.nlp.annotator._ + * import com.johnsnowlabs.nlp.base._ + * import org.apache.spark.ml.Pipeline + * import org.apache.spark.sql.DataFrame + * import org.apache.spark.sql.functions.lit + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("caption") + * .setOutputCol("caption_document") + * + * val imageAssembler = new ImageAssembler() + * .setInputCol("image") + * .setOutputCol("image_assembler") + * + * val imagesPath = "src/test/resources/image/" + * val data: DataFrame = ImageAssembler + * .loadImagesAsBytes(ResourceHelper.spark, imagesPath) + * .withColumn("caption", lit("Caption this image.")) // Add a caption to each image. 
+ * + * val nPredict = 40 + * val model = AutoGGUFVisionModel.pretrained() + * .setInputCols("caption_document", "image_assembler") + * .setOutputCol("completions") + * .setBatchSize(4) + * .setNGpuLayers(99) + * .setNCtx(4096) + * .setMinKeep(0) + * .setMinP(0.05f) + * .setNPredict(nPredict) + * .setNProbs(0) + * .setPenalizeNl(false) + * .setRepeatLastN(256) + * .setRepeatPenalty(1.18f) + * .setStopStrings(Array("", "Llama:", "User:")) + * .setTemperature(0.05f) + * .setTfsZ(1) + * .setTypicalP(1) + * .setTopK(40) + * .setTopP(0.95f) + * + * val pipeline = new Pipeline().setStages(Array(documentAssembler, imageAssembler, model)) + * pipeline + * .fit(data) + * .transform(data) + * .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "completions.result") + * .show(truncate = false) + * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * |image_name |result | + * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * |palace.JPEG |[ The image depicts a large, ornate room with high ceilings and beautifully decorated walls. There are several chairs placed throughout the space, some of which have cushions] | + * |egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the scene and appears to be sleeping while holding] | + * |hippopotamus.JPEG|[ A large brown hippo is swimming in a body of water, possibly an aquarium. The hippo appears to be enjoying its time in the water and seems relaxed as it floats] | + * |hen.JPEG |[ The image features a large chicken standing next to several baby chickens. 
In total, there are five birds in the scene: one adult and four young ones. They appear to be gathered together] | + * |ostrich.JPEG |[ The image features a large, long-necked bird standing in the grass. It appears to be an ostrich or similar species with its head held high and looking around. In addition to] | + * |junco.JPEG |[ A small bird with a black head and white chest is standing on the snow. It appears to be looking at something, possibly food or another animal in its vicinity. The scene takes place out] | + * |bluetick.jpg |[ A dog with a red collar is sitting on the floor, looking at something. The dog appears to be staring into the distance or focusing its attention on an object in front of it.] | + * |chihuahua.jpg |[ A small brown dog wearing a sweater is sitting on the floor. The dog appears to be looking at something, possibly its owner or another animal in the room. It seems comfortable and relaxed]| + * |tractor.JPEG |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels and tires. The tractor appears to be parked on top of an empty field with] | + * |ox.JPEG |[ A large bull with horns is standing in a grassy field.] | + * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * }}} + * + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. 
Users can set and get the + * parameter values through setters and getters, respectively. + */ +class AutoGGUFVisionModel(override val uid: String) + extends AnnotatorModel[AutoGGUFVisionModel] + with HasBatchedAnnotateTextImage[AutoGGUFVisionModel] + with HasEngine + with HasLlamaCppModelProperties + with HasLlamaCppInferenceProperties + with HasProtectedParams { + + override val inputAnnotatorTypes: Array[AnnotatorType] = + Array(AnnotatorType.IMAGE, AnnotatorType.DOCUMENT) + override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("AutoGGUFVisionModel")) + + private var _model: Option[Broadcast[GGUFWrapperMultiModal]] = None + + /** @group getParam */ + def getModelIfNotSet: GGUFWrapperMultiModal = _model.get.value + + /** @group setParam */ + def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapperMultiModal): this.type = { + if (_model.isEmpty) { + _model = Some(spark.sparkContext.broadcast(wrapper)) + } + + // Entrypoint for models. Automatically set GPU support if detected. + setGpuSupportIfAvailable(spark) + this + } + + private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) + + setDefault( + engine -> LlamaCPP.name, + useChatTemplate -> true, + nCtx -> 4096, + nBatch -> 512, + embedding -> false, + nPredict -> 100) + +// val mmproj = new Param[String]( +// this, +// "mmproj", +// "Name of the file for the multi-modal projection (mmproj) model, that encodes the images.") +// +// /** Sets the path to the multi-modal projection (mmproj) model, that encodes the images. +// * +// * Should only be used by this class and not by the user. 
+// * +// * @param value +// * Name of the file for the multi-modal projection (mmproj) model +// * @return +// */ +// private def setMmproj(value: String): this.type = set(mmproj, value) +// +// private def getMmproj: String = $(mmproj) + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + getModelIfNotSet.saveToFile(path) + } + + /** Completes the batch of annotations. + * + * @param batchedAnnotations + * The single batch of annotations, as (caption, image) pairs + * @return + * One sequence of completion annotations per input pair, in input order. If llama.cpp throws a + * [[LlamaException]], every completion is an empty string and the error message is stored + * under the `LlamaException` metadata key. + */ + override def batchAnnotate( + batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = { + if (batchedAnnotations.nonEmpty) { + + // set parallel decoding to batch size + val modelParams = getModelParameters.setNParallel(getBatchSize) + val model: LlamaModel = getModelIfNotSet.getSession(modelParams) + + val (prompts, base64EncodedImages) = batchedAnnotations.unzip match { + case (promptAnnotations, imageAnnotations) => + ( + promptAnnotations.map(_.result).toArray, + imageAnnotations + .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result)) + .toArray) + } + + val (completedTexts: Array[String], metadata: Map[String, String]) = + try { + ( + model.requestBatchImageCompletion( + prompts, + base64EncodedImages, + getInferenceParameters), + Map.empty) + } catch { + case e: LlamaException => + logger.error("Error in llama.cpp image batch completion", e) + // Keep one (empty) completion per input: an empty array would make the zip below + // produce nothing, losing both the error metadata and the rows of this batch. 
+ (Array.fill(prompts.length)(""), Map("LlamaException" -> e.getMessage)) + } + + val result: Seq[Seq[Annotation]] = + batchedAnnotations.zip(completedTexts).map { + case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) => + val totalMetadata = + textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata + Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata)) + } + result + } else Seq(Seq.empty[Annotation]) + } +} + +trait 
ReadablePretrainedAutoGGUFVisionModel + extends ParamsAndFeaturesReadable[AutoGGUFVisionModel] + with HasPretrained[AutoGGUFVisionModel] { + override val defaultModelName: Some[String] = Some("llava_v1.5_7b_Q4_0_gguf") + override val defaultLang: String = "en" + + /** Java compliant-overrides */ + override def pretrained(): AutoGGUFVisionModel = super.pretrained() + + override def pretrained(name: String): AutoGGUFVisionModel = super.pretrained(name) + + override def pretrained(name: String, lang: String): AutoGGUFVisionModel = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFVisionModel = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadAutoGGUFVisionModel { + this: ParamsAndFeaturesReadable[AutoGGUFVisionModel] => + + def readModel(instance: AutoGGUFVisionModel, path: String, spark: SparkSession): Unit = { + val model: GGUFWrapperMultiModal = GGUFWrapperMultiModal.readModel(path, spark) + + instance.setModelIfNotSet(spark, model) + } + + addReader(readModel) + + def loadSavedModel( + modelPath: String, + mmprojPath: String, + spark: SparkSession): AutoGGUFVisionModel = { + // TODO potentially enable download from HF-URLS + val localPathModel: String = ResourceHelper.copyToLocal(modelPath) + val localPathMmproj: String = ResourceHelper.copyToLocal(mmprojPath) + + val annotatorModel = new AutoGGUFVisionModel() + val wrapper = GGUFWrapperMultiModal.read(spark, localPathModel, localPathMmproj) + + annotatorModel + .setModelIfNotSet(spark, wrapper) + .setEngine(LlamaCPP.name) + + // TODO mmproj metadata necessary? + val metadata = LlamaModel.getMetadataFromFile(localPathModel) + if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) + annotatorModel + } +} + +/** This is the companion object of [[AutoGGUFVisionModel]]. Please refer to that class for the + * documentation. 
+ */ +object AutoGGUFVisionModel + extends ReadablePretrainedAutoGGUFVisionModel + with ReadAutoGGUFVisionModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 3a6e69b79c5cd1..1f17c8d711ea0c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -691,12 +691,13 @@ object PythonResourceDownloader { "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, "BertForMultipleChoice" -> BertForMultipleChoice, "PromptAssembler" -> PromptAssembler, - "CPMTransformer"-> CPMTransformer, + "CPMTransformer" -> CPMTransformer, "NomicEmbeddings" -> NomicEmbeddings, "NLLBTransformer" -> NLLBTransformer, "Phi3Transformer" -> Phi3Transformer, "QwenTransformer" -> QwenTransformer, - "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings) + "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings, + "AutoGGUFVisionModel" -> AutoGGUFVisionModel) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTestSpec.scala new file mode 100644 index 00000000000000..961e2fc49b4488 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTestSpec.scala @@ -0,0 +1,121 @@ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.{Annotation, AnnotationImage, ImageAssembler} +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.functions.lit +import 
org.apache.spark.sql.{DataFrame, Row}
+import org.scalatest.flatspec.AnyFlatSpec
+
+import scala.collection.mutable
+
+/** Slow integration tests for [[AutoGGUFVisionModel]].
+  *
+  * The spec loads the images under `src/test/resources/image/` as raw bytes, runs them through a
+  * document assembler / image assembler / vision model pipeline, and checks the generated
+  * captions as well as the save/load round trip of the model stage.
+  */
+class AutoGGUFVisionModelTestSpec extends AnyFlatSpec {
+
+  behavior of "AutoGGUFVisionModel"
+
+  lazy val documentAssembler = new DocumentAssembler()
+    .setInputCol("caption")
+    .setOutputCol("caption_document")
+
+  lazy val imageAssembler = new ImageAssembler()
+    .setInputCol("image")
+    .setOutputCol("image_assembler")
+
+  lazy val imagesPath = "src/test/resources/image/"
+  lazy val data: DataFrame = ImageAssembler
+    .loadImagesAsBytes(ResourceHelper.spark, imagesPath)
+    .withColumn("caption", lit("Caption this image.")) // Add a caption to each image.
+
+  // Word that has to appear in the generated caption, keyed by image file name.
+  lazy val expectedWords: Map[String, String] = Map(
+    "bluetick.jpg" -> "dog",
+    "chihuahua.jpg" -> "dog",
+    "egyptian_cat.jpeg" -> "cat",
+    "hen.JPEG" -> "chick",
+    "hippopotamus.JPEG" -> "hippo",
+    "junco.JPEG" -> "bird",
+    "ostrich.JPEG" -> "ostrich",
+    "ox.JPEG" -> "bull",
+    "palace.JPEG" -> "room",
+    "tractor.JPEG" -> "tractor")
+
+  lazy val nPredict = 40
+  // NOTE(review): the first stop string below is empty -- possibly an end-of-sequence marker
+  // that was lost somewhere; confirm the intended value.
+  lazy val model = AutoGGUFVisionModel
+    .pretrained()
+    .setInputCols("caption_document", "image_assembler")
+    .setOutputCol("completions")
+    .setChatTemplate("vicuna") // llava uses vicuna as default
+    .setBatchSize(2)
+    .setNGpuLayers(99)
+    .setNCtx(4096)
+    .setMinKeep(0)
+    .setMinP(0.05f)
+    .setNPredict(nPredict)
+    .setNProbs(0)
+    .setPenalizeNl(false)
+    .setRepeatLastN(256)
+    .setRepeatPenalty(1.18f)
+    .setStopStrings(Array("", "Llama:", "User:"))
+    .setTemperature(0.05f)
+    .setTfsZ(1)
+    .setTypicalP(1)
+    .setTopK(40)
+    .setTopP(0.95f)
+
+  lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, imageAssembler, model))
+
+  /** Asserts that `image.data` holds exactly the bytes of the `content` column.
+    *
+    * Both columns are read from the SAME row: two independent `limit(1)` queries are not
+    * guaranteed to return the same image, which could make the comparison fail spuriously.
+    * Fails with an explicit message when no image was loaded at all.
+    */
+  def checkBinaryContents(): Unit = {
+    val firstRow: Row = data
+      .select("image.data", "content")
+      .limit(1)
+      .collect()
+      .headOption
+      .getOrElse(fail(s"No images were loaded from $imagesPath"))
+    val imageData = firstRow.getAs[Array[Byte]](0)
+    val byteContent = firstRow.getAs[Array[Byte]](1)
+
+    assert(imageData.length == byteContent.length)
+    assert(imageData sameElements byteContent)
+  }
+
+  it should "replace image data with bytes" taggedAs SlowTest in {
+    checkBinaryContents()
+  }
+
+  it should "caption the images correctly" taggedAs SlowTest in {
+    import java.lang.management.ManagementFactory
+    // Prints the part of the runtime name before '@' (labelled as the PID by the message).
+    val pid = ManagementFactory.getRuntimeMXBean.getName.split("@")(0)
+    println(s"Current PID: $pid")
+
+    val result = pipeline.fit(data).transform(data.repartition(1))
+
+    // Pair the first image annotation of every row with its first completion.
+    val imageWithCompletions: Array[(AnnotationImage, Annotation)] =
+      result.select("image_assembler", "completions").collect().map { row =>
+        val image = AnnotationImage(row.getAs[mutable.WrappedArray[Row]](0).head)
+        val annotation = Annotation(row.getAs[mutable.WrappedArray[Row]](1).head)
+        (image, annotation)
+      }
+
+    imageWithCompletions.foreach { case (image, completion) =>
+      val fileName = image.origin.split("/").last
+      // An image without a registered expectation is a test-setup error: say so explicitly
+      // instead of surfacing a bare NoSuchElementException.
+      val expectedWord = expectedWords.getOrElse(
+        fileName,
+        fail(s"No expected word registered for image $fileName"))
+      // Report the generated caption itself; interpolating the DataFrame here was useless.
+      assert(
+        completion.result.contains(expectedWord),
+        s"Expected word $expectedWord not found in caption of $fileName: ${completion.result}")
+    }
+  }
+
+  it should "be serializable" taggedAs SlowTest in {
+    val pipelineModel = pipeline.fit(data)
+    val savePath = "./tmp_autogguf_vision_model"
+    // Only the last stage (the vision model) is written and loaded back.
+    pipelineModel.stages.last
+      .asInstanceOf[AutoGGUFVisionModel]
+      .write
+      .overwrite()
+      .save(savePath)
+
+    val loadedModel = AutoGGUFVisionModel.load(savePath)
+    val newPipeline: Pipeline =
+      new Pipeline().setStages(Array(documentAssembler, imageAssembler, loadedModel))
+
+    // Smoke check: the reloaded model must still produce completions for one image.
+    newPipeline
+      .fit(data)
+      .transform(data.limit(1))
+      .select("completions")
+      .show(truncate = false)
+  }
+}