From 24b31ccbb5b5812fe1bb4110e62392d9d4b9f64f Mon Sep 17 00:00:00 2001
From: remsky <jeremy.braun@ucalgary.ca>
Date: Tue, 11 Feb 2025 04:49:48 -0700
Subject: [PATCH] -Fixed espeak engagement on gpu -Add default voice code
 setting and update language code resolution logic

---
 api/src/core/config.py         |  1 +
 api/src/inference/kokoro_v1.py | 14 ++++++++++----
 docker/gpu/Dockerfile          | 27 +++++++++++++++------------
 3 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/api/src/core/config.py b/api/src/core/config.py
index d2e369b..26b4178 100644
--- a/api/src/core/config.py
+++ b/api/src/core/config.py
@@ -13,6 +13,7 @@ class Settings(BaseSettings):
     output_dir: str = "output"
     output_dir_size_limit_mb: float = 500.0  # Maximum size of output directory in MB
     default_voice: str = "af_heart"
+    default_voice_code: str | None = None  # Optional language code used in place of the voice name's first letter; a lang_code passed with the API call still takes precedence
     use_gpu: bool = True  # Whether to use GPU acceleration if available
     allow_local_voice_saving: bool = (
         False  # Whether to allow saving combined voices locally
diff --git a/api/src/inference/kokoro_v1.py b/api/src/inference/kokoro_v1.py
index 9f5e206..28ad272 100644
--- a/api/src/inference/kokoro_v1.py
+++ b/api/src/inference/kokoro_v1.py
@@ -139,8 +139,14 @@ async def generate_from_tokens(
             await paths.save_voice_tensor(voice_tensor, temp_path)
             voice_path = temp_path
 
-            # Use provided lang_code or get from voice name
-            pipeline_lang_code = lang_code if lang_code else voice_name[0].lower()
+            # Resolve the pipeline language code, highest priority first:
+            #   1. lang_code supplied with the API call
+            #   2. settings.default_voice_code, when configured
+            #   3. first letter of the voice name, lowercased
+            pipeline_lang_code = (
+                lang_code or settings.default_voice_code or voice_name[0].lower()
+            )
+
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
@@ -232,8 +238,8 @@ async def generate(
             await paths.save_voice_tensor(voice_tensor, temp_path)
             voice_path = temp_path
 
-            # Use provided lang_code or get from voice name
-            pipeline_lang_code = lang_code if lang_code else voice_name[0].lower()
+            # Precedence: API lang_code, then settings.default_voice_code, then the voice name's first letter
+            pipeline_lang_code = lang_code or settings.default_voice_code or voice_name[0].lower()
             pipeline = self._get_pipeline(pipeline_lang_code)
 
             logger.debug(
diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile
index ce0f646..ea9b339 100644
--- a/docker/gpu/Dockerfile
+++ b/docker/gpu/Dockerfile
@@ -12,6 +12,7 @@ RUN apt-get update && apt-get install -y \
     libsndfile1 \
     curl \
     ffmpeg \
+    g++ \
  && apt-get clean && rm -rf /var/lib/apt/lists/* \
  && mkdir -p /usr/share/espeak-ng-data \
  && ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/
@@ -19,40 +20,42 @@ RUN apt-get update && apt-get install -y \
 # Install UV using the installer script
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
     mv /root/.local/bin/uv /usr/local/bin/ && \
-    mv /root/.local/bin/uvx /usr/local/bin/ && \
-    useradd -m -u 1000 appuser && \
-    mkdir -p /app/api/src/models/v1_0 && \
-    chown -R appuser:appuser /app 
+    mv /root/.local/bin/uvx /usr/local/bin/
 
+# Create non-root user and set up directories and permissions
+RUN useradd -m -u 1000 appuser && \
+    mkdir -p /app/api/src/models/v1_0 && \
+    chown -R appuser:appuser /app
+
 USER appuser
 WORKDIR /app
 
 # Copy dependency files
 COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml
 
+ENV PHONEMIZER_ESPEAK_PATH=/usr/bin \
+    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
+    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
+
 # Install dependencies with GPU extras (using cache mounts)
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv venv && \
     uv sync --extra gpu
 
-# Copy project files including models and sync again
+# Copy project files including models
 COPY --chown=appuser:appuser api ./api
 COPY --chown=appuser:appuser web ./web
 COPY --chown=appuser:appuser docker/scripts/ ./
 RUN chmod +x ./entrypoint.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv sync --extra gpu
+
 
 # Set all environment variables in one go
 ENV PYTHONUNBUFFERED=1 \
     PYTHONPATH=/app:/app/api \
     PATH="/app/.venv/bin:$PATH" \
     UV_LINK_MODE=copy \
-    USE_GPU=true \
-    PHONEMIZER_ESPEAK_PATH=/usr/bin \
-    PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
-    ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
-    
+    USE_GPU=true
+
 ENV DOWNLOAD_MODEL=true
 # Download model if enabled
 RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \