Skip to content

Commit

Permalink
Fix PyTriton library discovery (#505)
Browse files Browse the repository at this point in the history
Detect and configure the required library paths (i.e., to Triton bundled
libraries and libpython3.*.so) when starting the Triton process. This avoids
the need to set executorEnv globally and works across conda/non-conda envs
(e.g., Dataproc vs. Databricks).
Minor cleanups to PyTriton server shutdown.

---------

Signed-off-by: Rishi Chandra <rishic@nvidia.com>
  • Loading branch information
rishic3 authored Feb 27, 2025
1 parent 14e9bbc commit 5d52c1b
Show file tree
Hide file tree
Showing 15 changed files with 45 additions and 126 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -275,20 +275,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,12 +244,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,20 +340,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,12 +307,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -894,12 +894,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\") \n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
53 changes: 45 additions & 8 deletions examples/ML+DL-Examples/Spark-DL/dl_inference/pytriton_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import signal
import socket
import sys
import time
from multiprocessing import Process
from typing import Callable, Dict, List, Optional, Tuple
Expand All @@ -42,8 +43,31 @@ def _start_triton_server(
model_path: Optional[str] = None,
) -> List[tuple]:
"""Task to start Triton server process on a Spark executor."""
sig = inspect.signature(triton_server_fn)
params = sig.parameters

def _prepare_pytriton_env():
"""Expose PyTriton to correct libpython3.11.so and Triton bundled libraries."""
ld_library_paths = []

# Add nvidia_pytriton.libs to LD_LIBRARY_PATH
for path in sys.path:
if os.path.isdir(path) and "site-packages" in path:
libs_path = os.path.join(path, "nvidia_pytriton.libs")
if os.path.isdir(libs_path):
ld_library_paths.append(libs_path)
break

# Add ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH for conda environments
if os.path.exists(os.path.join(sys.prefix, "conda-meta")):
conda_lib = os.path.join(sys.prefix, "lib")
if os.path.isdir(conda_lib):
ld_library_paths.append(conda_lib)

if "LD_LIBRARY_PATH" in os.environ:
ld_library_paths.append(os.environ["LD_LIBRARY_PATH"])

os.environ["LD_LIBRARY_PATH"] = ":".join(ld_library_paths)

return None

def _find_ports(start_port: int = 7000) -> List[int]:
"""Find available ports for Triton's HTTP, gRPC, and metrics services."""
Expand All @@ -59,6 +83,8 @@ def _find_ports(start_port: int = 7000) -> List[int]:
return ports

ports = _find_ports()
sig = inspect.signature(triton_server_fn)
params = sig.parameters

if model_path is not None:
assert (
Expand All @@ -69,6 +95,7 @@ def _find_ports(start_port: int = 7000) -> List[int]:
assert len(params) == 1, "Server function must accept (ports) argument"
args = (ports,)

_prepare_pytriton_env()
hostname = socket.gethostname()
process = Process(target=triton_server_fn, args=args)
process.start()
Expand All @@ -83,6 +110,11 @@ def _find_ports(start_port: int = 7000) -> List[int]:
except Exception:
pass

client.close()
if process.is_alive():
# Terminate if timeout is exceeded to avoid dangling server processes
process.terminate()

raise TimeoutError(
"Failure: server startup timeout exceeded. Check the executor logs for more info."
)
Expand All @@ -98,14 +130,19 @@ def _stop_triton_server(
pid, _ = server_pids_ports.get(hostname)
assert pid is not None, f"No server PID found for host {hostname}"

for _ in range(wait_retries):
try:
process = psutil.Process(pid)
process.terminate()
process.wait(timeout=wait_timeout * wait_retries)
return [True]
except psutil.NoSuchProcess:
return [True]
except psutil.TimeoutExpired:
try:
os.kill(pid, signal.SIGTERM)
except OSError:
process.kill()
return [True]
time.sleep(wait_timeout)

return [False] # Failed to terminate or timed out
except:
return [False]


class TritonServerManager:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,20 +815,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -991,20 +991,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,20 +189,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\") \n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1210,20 +1210,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down

0 comments on commit 5d52c1b

Please sign in to comment.