Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"error": "Unable to parse 'inputs': attempt to access non-existing object member 'inputs'" #683

Open
adityarap opened this issue Jan 28, 2025 · 1 comment

Comments

@adityarap
Copy link

I deployed a Llama-3-8B model compiled with the TRT-LLM backend using all the default parameters, and used inflight-batcher to create the model repository. I was able to serve it and get responses on a local deployment.

I then deployed the same model repository on Vertex AI and tested it with the sample payload below:

{
  "instances": [
    {
      "name": "input_ids",
      "shape": [1, 8],
      "datatype": "INT32",
      "data": [101, 2054, 2003, 14227, 11107, 10899, 3941, 102]
    },
    {
      "name": "input_lengths",
      "shape": [1, 1],
      "datatype": "INT32",
      "data": [8]
    },
    {
      "name": "request_output_len",
      "shape": [1, 1],
      "datatype": "INT32",
      "data": [50]
    }
  ]
}

It returns the following error:

{
 "error": "Unable to parse 'inputs': attempt to access non-existing object member 'inputs'"
}

Below is my config.pbtxt:

# Triton model configuration (protobuf text format) for the top-level model
# named "ensemble".
name: "ensemble"
# The "ensemble" platform: the work is described by the steps listed under
# ensemble_scheduling further down in this file.
platform: "ensemble"
# Largest batch size this configuration declares for the model.
max_batch_size: 256
# Request tensors declared by the ensemble.
# Only "text_input" and "max_tokens" are required; every other entry below is
# marked `optional: true`.
# Note that "input_ids", "input_lengths" and "request_output_len" are NOT
# declared here -- in this file they only appear as input_map keys of the
# "tensorrt_llm" step inside ensemble_scheduling.
# NOTE(review): with a non-zero max_batch_size the dims below presumably omit
# the leading batch dimension -- confirm against the Triton model-config docs.
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "num_return_sequences"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
   name: "bad_words"
   data_type: TYPE_STRING
   dims: [ -1 ]
   optional: true
  },
  {
   name: "stop_words"
   data_type: TYPE_STRING
   dims: [ -1 ]
   optional: true
  },
  {
    name: "exclude_input_in_output"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_kv_cache_reuse_stats"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_table_extra_id"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "lora_task_id"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "lora_weights"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "lora_config"
    data_type: TYPE_INT32
    dims: [ -1, 3 ]
    optional: true
    allow_ragged_batch: true
  }
]
# Response tensors declared by the ensemble.
# "text_output" is mapped from the OUTPUT tensor of the "postprocessing" step;
# every other tensor listed here is mapped from an output of the
# "tensorrt_llm" step (see the output_map entries in ensemble_scheduling).
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  },
  {
    name: "batch_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "sequence_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_alloc_new_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_reused_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_alloc_total_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]
# Wiring of the ensemble: three steps, listed as
#   preprocessing -> tensorrt_llm -> postprocessing.
# In every input_map / output_map, `key` is the tensor name on the step's own
# model and `value` is the ensemble-level tensor it is connected to.
# Values that start with "_" (e.g. "_INPUT_ID") are not declared in the
# ensemble's input/output lists; they are produced by one step's output_map
# and consumed by a later step's input_map.
# NOTE(review): `model_version: -1` presumably selects the latest available
# version of the step model -- confirm against the Triton ensemble docs.
# NOTE(review): some entries below end in "," -- protobuf text format treats
# that as an optional separator, so it should parse; confirm if in doubt.
ensemble_scheduling {
  step [
    # Step 1: "preprocessing". Fed from the ensemble's text/word-list/id
    # inputs; its outputs are all published as "_"-prefixed intermediates.
    {
      model_name: "preprocessing"
      model_version: -1
      input_map {
        key: "QUERY"
        value: "text_input"
      }
      input_map {
        key: "DECODER_QUERY"
        value: "decoder_text_input"
      }
      input_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "max_tokens"
      }
      input_map {
        key: "BAD_WORDS_DICT"
        value: "bad_words"
      }
      input_map {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WORDS"
        value: "embedding_bias_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WEIGHTS"
        value: "embedding_bias_weights"
      }
      input_map {
        key: "END_ID"
        value: "end_id"
      }
      input_map {
        key: "PAD_ID"
        value: "pad_id"
      }
      input_map {
        key: "PROMPT_TABLE_EXTRA_ID"
        value: "prompt_table_extra_id"
      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
      }
      output_map {
        key: "INPUT_ID"
        value: "_INPUT_ID"
      }
      output_map {
        key: "REQUEST_DECODER_INPUT_LEN"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      output_map {
        key: "DECODER_INPUT_ID"
        value: "_DECODER_INPUT_ID"
      }
      output_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
      output_map {
        key: "STOP_WORDS_IDS"
        value: "_STOP_WORDS_IDS"
      }
      output_map {
        key: "BAD_WORDS_IDS"
        value: "_BAD_WORDS_IDS"
      }
      output_map {
        key: "EMBEDDING_BIAS"
        value: "_EMBEDDING_BIAS"
      }
      output_map {
        key: "OUT_END_ID"
        value: "_PREPROCESSOR_END_ID"
      }
      output_map {
        key: "OUT_PAD_ID"
        value: "_PREPROCESSOR_PAD_ID"
      }
      output_map {
        key: "OUT_PROMPT_TABLE_EXTRA_IDS"
        value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
      }
    },
    # Step 2: "tensorrt_llm". Its "input_ids", "input_lengths" and
    # "request_output_len" tensors are fed from step 1's intermediates
    # ("_INPUT_ID", "_REQUEST_INPUT_LEN", "_REQUEST_OUTPUT_LEN"), not from
    # ensemble inputs of the same name. Sampling/LoRA/prompt-table options are
    # passed through from the ensemble inputs, some under a different key
    # (top_k -> runtime_top_k, top_p -> runtime_top_p,
    # length_penalty -> len_penalty, stream -> streaming).
    {
      model_name: "tensorrt_llm"
      model_version: -1
      input_map {
        key: "input_ids"
        value: "_INPUT_ID"
      }
      input_map {
        key: "decoder_input_ids"
        value: "_DECODER_INPUT_ID"
      }
      input_map {
        key: "input_lengths"
        value: "_REQUEST_INPUT_LEN"
      }
      input_map {
        key: "decoder_input_lengths"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      input_map {
        key: "exclude_input_in_output"
        value: "exclude_input_in_output"
      }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
      }
      input_map {
          key: "end_id"
          value: "_PREPROCESSOR_END_ID"
      }
      input_map {
          key: "pad_id"
          value: "_PREPROCESSOR_PAD_ID"
      }
      input_map {
          key: "embedding_bias"
          value: "_EMBEDDING_BIAS"
      }
      input_map {
          key: "runtime_top_k"
          value: "top_k"
      }
      input_map {
          key: "runtime_top_p"
          value: "top_p"
      }
      input_map {
          key: "temperature"
          value: "temperature"
      }
      input_map {
          key: "len_penalty"
          value: "length_penalty"
      }
      input_map {
          key: "repetition_penalty"
          value: "repetition_penalty"
      }
      input_map {
          key: "min_length"
          value: "min_length"
      }
      input_map {
          key: "presence_penalty"
          value: "presence_penalty"
      }
      input_map {
          key: "frequency_penalty"
          value: "frequency_penalty"
      }
      input_map {
          key: "random_seed"
          value: "random_seed"
      }
      input_map {
          key: "return_log_probs"
          value: "return_log_probs"
      }
      input_map {
          key: "return_context_logits"
          value: "return_context_logits"
      }
      input_map {
          key: "return_generation_logits"
          value: "return_generation_logits"
      }
      input_map {
          key: "return_kv_cache_reuse_stats"
          value: "return_kv_cache_reuse_stats"
      }
      input_map {
          key: "num_return_sequences"
          value: "num_return_sequences"
      }
      input_map {
          key: "beam_width"
          value: "beam_width"
      }
      input_map {
          key: "streaming"
          value: "stream"
      }
      input_map {
        key: "prompt_embedding_table"
        value: "prompt_embedding_table"
      }
      input_map {
        key: "prompt_vocab_size"
        value: "prompt_vocab_size"
      }
      input_map {
        key: "stop_words_list"
        value: "_STOP_WORDS_IDS"
      }
      input_map {
        key: "bad_words_list"
        value: "_BAD_WORDS_IDS"
      }
      input_map {
        key: "prompt_table_extra_ids"
        value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
      },
      input_map {
        key: "lora_task_id",
        value: "lora_task_id"
      },
      input_map {
        key: "lora_weights",
        value: "lora_weights"
      },
      input_map {
        key: "lora_config",
        value: "lora_config"
      },
      # "output_ids" and "sequence_length" become intermediates for step 3;
      # the remaining outputs are mapped straight to ensemble outputs.
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
      output_map {
        key: "sequence_length"
        value: "_SEQUENCE_LENGTH"
      },
      output_map {
        key: "cum_log_probs"
        value: "cum_log_probs"
      }
      output_map {
        key: "output_log_probs"
        value: "output_log_probs"
      },
      output_map {
        key: "context_logits"
        value: "context_logits"
      },
      output_map {
        key: "generation_logits"
        value: "generation_logits"
      },
      output_map {
        key: "batch_index"
        value: "batch_index"
      },
      output_map {
        key: "sequence_index"
        value: "sequence_index"
      },
      output_map {
        key: "kv_cache_alloc_new_blocks"
        value: "kv_cache_alloc_new_blocks"
      },
      output_map {
        key: "kv_cache_reused_blocks"
        value: "kv_cache_reused_blocks"
      },
      output_map {
        key: "kv_cache_alloc_total_blocks"
        value: "kv_cache_alloc_total_blocks"
      }
    },
    # Step 3: "postprocessing". Consumes the two intermediates from step 2 and
    # publishes its OUTPUT tensor as the ensemble's "text_output".
    {
      model_name: "postprocessing"
      model_version: -1
      input_map {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
      input_map {
        key: "SEQUENCE_LENGTH"
        value: "_SEQUENCE_LENGTH"
      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
    }
  ]
}
@adityarap
Copy link
Author

FYI: I used the custom container image below.

# Base image: NVIDIA Triton Inference Server with TensorRT-LLM support
# (tag 24.12-trtllm-python-py3)
FROM nvcr.io/nvidia/tritonserver:24.12-trtllm-python-py3

# Set the working directory inside the container
WORKDIR /app

# Refresh the apt package index and upgrade the packages already present in
# the base image
RUN apt-get update && \
    apt-get upgrade -y

# Install the Google Cloud SDK (for gsutil): store Google's apt signing key,
# register the cloud-sdk apt repository, then install the package
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | tee /etc/apt/trusted.gpg.d/google.asc
RUN echo "deb https://packages.cloud.google.com/apt cloud-sdk main" | tee /etc/apt/sources.list.d/google-cloud-sdk.list
RUN apt-get update && apt-get install -y google-cloud-sdk
# Delete the anthoscli binary that the SDK package installs
RUN rm -rf /usr/lib/google-cloud-sdk/bin/anthoscli

# Declare the container's listening ports: 8000 is Triton's default HTTP port;
# 8080 is presumably the port used for Vertex AI serving -- confirm
EXPOSE 8000 8080

# Allow Open MPI's mpirun (used in ENTRYPOINT below) to run as the root user
ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1

# Copy the model directory and the Triton model repository from the GCS bucket
# into /app. This is a RUN instruction, so the copy executes while the image is
# being built and the files are baked into the image.
# NOTE(review): the original comment says no service account key is needed;
# the build environment must then already have read access to the bucket --
# confirm how credentials are supplied at build time.
RUN gsutil -m cp -r gs://llama-model-bucket/Meta-Llama-3-8B-Instruct /app/ && \
        gsutil -m cp -r gs://llama-model-bucket/triton_model_repo /app/

# Run Triton Inference Server on container start. ENTRYPOINT and CMD combine
# into:
#   mpirun -n 1 tritonserver --model-repository=/app/triton_model_repo/ --vertex-ai-default-model=ensemble
# so the "ensemble" model is named as the Vertex AI default model.
ENTRYPOINT ["mpirun", "-n", "1"]
CMD [ "tritonserver", "--model-repository=/app/triton_model_repo/", "--vertex-ai-default-model=ensemble"]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant