Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PyTriton library discovery #505

Merged
merged 3 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -275,20 +275,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,12 +244,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,20 +340,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,12 +307,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
" conf.set(\"spark.executorEnv.HF_HOME\", hf_home)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -894,12 +894,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\") \n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
" conf.set(\"spark.executor.cores\", \"8\")\n",
Expand Down
53 changes: 45 additions & 8 deletions examples/ML+DL-Examples/Spark-DL/dl_inference/pytriton_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import signal
import socket
import sys
import time
from multiprocessing import Process
from typing import Callable, Dict, List, Optional, Tuple
Expand All @@ -42,8 +43,31 @@ def _start_triton_server(
model_path: Optional[str] = None,
) -> List[tuple]:
"""Task to start Triton server process on a Spark executor."""
sig = inspect.signature(triton_server_fn)
params = sig.parameters

def _prepare_pytriton_env():
"""Expose PyTriton to correct libpython3.11.so and Triton bundled libraries."""
ld_library_paths = []

# Add nvidia_pytriton.libs to LD_LIBRARY_PATH
for path in sys.path:
if os.path.isdir(path) and "site-packages" in path:
libs_path = os.path.join(path, "nvidia_pytriton.libs")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't be needed; it looks like a bug in the PyTriton Python package.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For libpython3.*.so, at least, the docs do instruct adding it to the library path in conda environments. But yes, linking the PyTriton libs seems like it shouldn't be necessary; I will ping on this.

if os.path.isdir(libs_path):
ld_library_paths.append(libs_path)
break

# Add ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH for conda environments
if os.path.exists(os.path.join(sys.prefix, "conda-meta")):
conda_lib = os.path.join(sys.prefix, "lib")
if os.path.isdir(conda_lib):
ld_library_paths.append(conda_lib)

if "LD_LIBRARY_PATH" in os.environ:
ld_library_paths.append(os.environ["LD_LIBRARY_PATH"])

os.environ["LD_LIBRARY_PATH"] = ":".join(ld_library_paths)

return None

def _find_ports(start_port: int = 7000) -> List[int]:
"""Find available ports for Triton's HTTP, gRPC, and metrics services."""
Expand All @@ -59,6 +83,8 @@ def _find_ports(start_port: int = 7000) -> List[int]:
return ports

ports = _find_ports()
sig = inspect.signature(triton_server_fn)
params = sig.parameters

if model_path is not None:
assert (
Expand All @@ -69,6 +95,7 @@ def _find_ports(start_port: int = 7000) -> List[int]:
assert len(params) == 1, "Server function must accept (ports) argument"
args = (ports,)

_prepare_pytriton_env()
hostname = socket.gethostname()
process = Process(target=triton_server_fn, args=args)
process.start()
Expand All @@ -83,6 +110,11 @@ def _find_ports(start_port: int = 7000) -> List[int]:
except Exception:
pass

client.close()
if process.is_alive():
# Terminate if timeout is exceeded to avoid dangling server processes
process.terminate()

raise TimeoutError(
"Failure: server startup timeout exceeded. Check the executor logs for more info."
)
Expand All @@ -98,14 +130,19 @@ def _stop_triton_server(
pid, _ = server_pids_ports.get(hostname)
assert pid is not None, f"No server PID found for host {hostname}"

for _ in range(wait_retries):
try:
process = psutil.Process(pid)
process.terminate()
process.wait(timeout=wait_timeout * wait_retries)
return [True]
except psutil.NoSuchProcess:
return [True]
except psutil.TimeoutExpired:
try:
os.kill(pid, signal.SIGTERM)
except OSError:
process.kill()
return [True]
time.sleep(wait_timeout)

return [False] # Failed to terminate or timed out
except:
return [False]


class TritonServerManager:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,20 +815,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -991,20 +991,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,20 +189,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\") \n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1210,20 +1210,7 @@
" conf.setMaster(f\"spark://{hostname}:7077\")\n",
" conf.set(\"spark.pyspark.python\", f\"{conda_env}/bin/python\")\n",
" conf.set(\"spark.pyspark.driver.python\", f\"{conda_env}/bin/python\")\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_env}/lib:{conda_env}/lib/python3.11/site-packages/nvidia_pytriton.libs:$LD_LIBRARY_PATH\")\n",
" source = \"/usr/lib/x86_64-linux-gnu/libstdc++.so.6\"\n",
" target = f\"{conda_env}/lib/libstdc++.so.6\"\n",
" try:\n",
" if os.path.islink(target) or os.path.exists(target):\n",
" os.remove(target)\n",
" os.symlink(source, target)\n",
" except OSError as e:\n",
" print(f\"Error creating symlink: {e}\")\n",
" elif on_dataproc:\n",
" # Point PyTriton to correct libpython3.11.so:\n",
" conda_lib_path=\"/opt/conda/miniconda3/lib\"\n",
" conf.set(\"spark.executorEnv.LD_LIBRARY_PATH\", f\"{conda_lib_path}:$LD_LIBRARY_PATH\")\n",
" conf.set(\"spark.executorEnv.TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" conf.set(\"spark.executor.instances\", \"4\") # dataproc defaults to 2\n",
"\n",
Expand Down