implement endpoint: stream_synthesis #1542

Open · wants to merge 3 commits into base: master
71 changes: 70 additions & 1 deletion voicevox_engine/app/routers/tts_pipeline.py
@@ -1,6 +1,7 @@
"""音声合成機能を提供する API Router"""

import zipfile
from collections.abc import Generator
from tempfile import NamedTemporaryFile, TemporaryFile
from typing import Annotated, Self

@@ -9,7 +10,7 @@
from pydantic import BaseModel, Field
from pydantic.json_schema import SkipJsonSchema
from starlette.background import BackgroundTask
from starlette.responses import FileResponse
from starlette.responses import FileResponse, StreamingResponse

from voicevox_engine.cancellable_engine import (
CancellableEngine,
@@ -380,6 +381,74 @@ def multi_synthesis(
background=BackgroundTask(try_delete_file, f.name),
)

@router.post(
"/stream_synthesis",
Review comment (Member):

Ah, I worked through what kind of signature would be good and wrote it up here, so please use it as a reference!!
#1492 (comment)
(For example, returning wav rather than pcm seems better, which may differ from this implementation.)

That said, I'm really not confident this spec is right, so pushback is very welcome!!

response_class=StreamingResponse,
responses={
200: {
"content": {
"audio/wav": {"schema": {"type": "string", "format": "binary"}}
},
}
},
tags=["音声合成"],
summary="ストリーミングで音声合成し、wavバイナリを逐次的に返す。24kHzモノラルのみ対応",
)
def stream_synthesis(
query: AudioQuery,
style_id: Annotated[StyleId, Query(alias="speaker")],
enable_interrogative_upspeak: Annotated[
bool,
Query(
description="疑問系のテキストが与えられたら語尾を自動調整する",
),
] = True,
core_version: str | SkipJsonSchema[None] = None,
) -> StreamingResponse:
if query.outputSamplingRate != 24000:
raise HTTPException(
status_code=422,
detail="24kHz以外のサンプリングレートはサポートされていません",
)
if query.outputStereo:
raise HTTPException(
status_code=422,
detail="ステレオ出力はサポートされていません",
)
version = core_version or LATEST_VERSION
engine = tts_engines.get_engine(version)
frame_length, wave_generator = engine.synthesize_wave_stream(
query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak
)

def generate_wav() -> Generator[bytes, None, None]:
data_size = frame_length * 2 # 16-bit PCM: 2 bytes per sample
file_size = data_size + 44 # 44-byte RIFF/fmt/data header
channel_size = 2 if query.outputStereo else 1 # always mono here; stereo is rejected above
block_size = 16 * channel_size // 8 # block align: bytes per sample frame
block_rate = query.outputSamplingRate * block_size # byte rate
# yield wav header, fmt chunk, and data chunk header
yield (
b"RIFF"
+ (file_size - 8).to_bytes(4, "little")
+ b"WAVEfmt "
+ (16).to_bytes(4, "little") # fmt header length
+ (1).to_bytes(2, "little") # PCM
+ channel_size.to_bytes(2, "little")
+ query.outputSamplingRate.to_bytes(4, "little")
+ block_rate.to_bytes(4, "little")
+ block_size.to_bytes(2, "little")
+ (16).to_bytes(2, "little") # bit depth
+ b"data"
+ data_size.to_bytes(4, "little")
)
# yield data chunk body
for wave in wave_generator:
pcm = (wave.clip(-1, 1) * 32767).astype("<i2")
yield pcm.tobytes()

return StreamingResponse(generate_wav(), media_type="audio/wav")
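As a side note for reviewers: the hand-built 44-byte header above can be cross-checked against the standard library. A minimal sketch (purely illustrative; generate_wav is the inner generator defined in this endpoint, so this would only run in its scope):

import io
import wave

# Hypothetical check, not part of the PR: drain the generator and let the
# stdlib wave module parse the RIFF/fmt/data layout it produced.
buf = io.BytesIO(b"".join(generate_wav()))
with wave.open(buf, "rb") as w:
    assert w.getnchannels() == 1       # mono only
    assert w.getsampwidth() == 2       # 16-bit PCM
    assert w.getframerate() == 24000   # fixed sampling rate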

@router.post(
"/sing_frame_audio_query",
tags=["クエリ作成"],
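For anyone who wants to try the endpoint end to end, here is a minimal client sketch. It assumes the engine is running at its usual default address (http://127.0.0.1:50021) and uses the existing /audio_query endpoint to build the AudioQuery:

import requests

BASE = "http://127.0.0.1:50021"  # assumed default engine address

# Build an AudioQuery with the existing endpoint.
query = requests.post(
    f"{BASE}/audio_query", params={"text": "こんにちは", "speaker": 1}
).json()

# Stream the wav and write each chunk as soon as it arrives.
with requests.post(
    f"{BASE}/stream_synthesis", params={"speaker": 1}, json=query, stream=True
) as resp:
    resp.raise_for_status()
    with open("out.wav", "wb") as f:
        for chunk in resp.iter_content(chunk_size=None):
            f.write(chunk)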
36 changes: 36 additions & 0 deletions voicevox_engine/core/core_adapter.py
@@ -62,6 +62,10 @@ def __init__(self, core: CoreWrapper):
def default_sampling_rate(self) -> int:
return self.core.default_sampling_rate

@property
def margin_width(self) -> int:
"""Margin width in frames that the core pads on each side of a rendered segment."""
return self.core.margin_width

@property
def characters(self) -> list[CoreCharacter]:
"""キャラクター情報"""
@@ -191,6 +195,38 @@ def safe_decode_forward(
sr_wave = self.default_sampling_rate
return wave, sr_wave

def safe_generate_full_intermediate(
self,
phoneme: NDArray[np.float32],
f0: NDArray[np.float32],
style_id: StyleId,
) -> NDArray[np.float32]:
"""Thread-safely compute an audio feature array from per-frame phonemes and pitch."""
self.initialize_style_id_synthesis(style_id, skip_reinit=True)
with self.mutex:
audio_feature = self.core.generate_full_intermediate(
length=phoneme.shape[0],
phoneme_size=phoneme.shape[1],
f0=f0[:, np.newaxis],
phoneme=phoneme,
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)
return audio_feature

def safe_render_audio_segment(
self,
audio_feature: NDArray[np.float32],
style_id: StyleId,
) -> tuple[NDArray[np.float32], int]:
"""Thread-safely render a waveform segment from a slice of audio features."""
self.initialize_style_id_synthesis(style_id, skip_reinit=True)
with self.mutex:
wave = self.core.render_audio_segment(
length=audio_feature.shape[0],
audio_feature=audio_feature,
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)
sr_wave = self.default_sampling_rate
return wave, sr_wave

def safe_predict_sing_consonant_length_forward(
self,
consonant: NDArray[np.int64],
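Together these two methods split the previous one-shot decode into a two-stage pipeline: a full-utterance feature pass followed by per-segment vocoding. A sketch of the intended call pattern (shapes assumed from the ctypes wrapper below; adapter, phoneme, f0, and style_id are placeholders):

# Stage 1: one pass over the whole utterance.
#   audio_feature has shape (n_frames + 2 * margin_width, feature_dim).
audio_feature = adapter.safe_generate_full_intermediate(phoneme, f0, style_id)

# Stage 2: vocode any slice of it; the margins let segments join cleanly.
segment = audio_feature[: 94 + 2 * adapter.margin_width, :]
wave, sr = adapter.safe_render_audio_segment(segment, style_id)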
78 changes: 78 additions & 0 deletions voicevox_engine/core/core_wrapper.py
@@ -560,6 +560,8 @@ def __init__(
load_all_models: bool = False,
) -> None:
self.default_sampling_rate = 24000
self.margin_width = 14 # frames of margin on each side of a rendered segment
self.feature_dim = 80 # feature channels per frame

self.core = load_core(core_dir, use_gpu)

@@ -731,6 +733,82 @@ def decode_forward(
)
return output

def generate_full_intermediate(
self,
length: int,
phoneme_size: int,
f0: NDArray[np.float32],
phoneme: NDArray[np.float32],
style_id: NDArray[np.int64],
) -> NDArray[np.float32]:
"""
フレームごとの音素と音高から音声特徴量を求める関数
Parameters
----------
length : int
フレームの長さ
phoneme_size : int
音素の種類数
f0 : NDArray[np.float32]
フレームごとの音高
phoneme : NDArray[np.float32]
フレームごとの音素
style_id : NDArray[np.int64]
スタイル番号
Returns
-------
output : NDArray[np.float32]
音声特徴量
"""
output = np.empty(
(length + 2 * self.margin_width, self.feature_dim), dtype=np.float32
)
self.assert_core_success(
self.core.generate_full_intermediate(
c_int(length),
c_int(phoneme_size),
f0.ctypes.data_as(POINTER(c_float)),
phoneme.ctypes.data_as(POINTER(c_float)),
style_id.ctypes.data_as(POINTER(c_long)),
output.ctypes.data_as(POINTER(c_float)),
)
)
return output

def render_audio_segment(
self,
length: int,
audio_feature: NDArray[np.float32],
style_id: NDArray[np.int64],
) -> NDArray[np.float32]:
"""
音声特徴量から音声波形を生成する関数
Parameters
----------
length : int
フレームの長さ
audio_feature : NDArray[np.float32]
音声特徴量
style_id : NDArray[np.int64]
スタイル番号
Returns
-------
output : NDArray[np.float32]
音声波形
"""
output = np.empty((length * 256,), dtype=np.float32)
self.assert_core_success(
self.core.render_audio_segment(
c_int(length),
c_int(self.margin_width),
c_int(self.feature_dim),
audio_feature.ctypes.data_as(POINTER(c_float)),
style_id.ctypes.data_as(POINTER(c_long)),
output.ctypes.data_as(POINTER(c_float)),
)
)
return output

def predict_sing_consonant_length_forward(
self,
length: int,
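The length bookkeeping implied by these two wrappers, worked through with the constants set in __init__ (margin_width=14, plus the 256-samples-per-frame factor visible in render_audio_segment):

# Illustrative arithmetic only, not engine code.
margin_width = 14      # frames padded on each side by generate_full_intermediate
upsample = 256         # audio samples rendered per feature frame

n_frames = 200                                  # valid frames for an utterance
feature_rows = n_frames + 2 * margin_width      # rows returned: 228

segment_rows = 94 + 2 * margin_width            # one ~1 s chunk plus margins: 122
rendered = segment_rows * upsample              # samples produced: 31232
kept = rendered - 2 * margin_width * upsample   # after trimming margins: 24064
assert kept == 94 * upsample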
36 changes: 36 additions & 0 deletions voicevox_engine/tts_pipeline/tts_engine.py
@@ -2,6 +2,7 @@

import copy
import math
from collections.abc import Generator
from typing import Final, Literal, TypeAlias

import numpy as np
@@ -591,6 +592,41 @@ def synthesize_wave(
wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
return wave

def synthesize_wave_stream(
self,
query: AudioQuery,
style_id: StyleId,
enable_interrogative_upspeak: bool = True,
) -> tuple[int, Generator[NDArray[np.float32], None, None]]:
"""生成音声全体のフレーム数と音声波形を生成する同期ストリームを返す"""
valid_chunk_size = 94 # 1sec * 24000Hz / 256frame
query = copy.deepcopy(query)
query.accent_phrases = _apply_interrogative_upspeak(
query.accent_phrases, enable_interrogative_upspeak
)

phoneme, f0 = _query_to_decoder_feature(query)
audio_feature = self._core.safe_generate_full_intermediate(
phoneme, f0, style_id
)

def wave_generator() -> Generator[NDArray[np.float32], None, None]:
for render_start in range(0, len(audio_feature), valid_chunk_size):
render_end = min(render_start + valid_chunk_size, len(audio_feature))
slice_start = render_start
slice_end = render_end + 2 * self._core.margin_width
feature_segment = audio_feature[slice_start:slice_end, :]
raw_wave_with_margin, sr_raw_wave = (
self._core.safe_render_audio_segment(feature_segment, style_id)
)
raw_wave = raw_wave_with_margin[
self._core.margin_width * 256 : -self._core.margin_width * 256
]
wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
yield wave

return len(audio_feature) * 256, wave_generator()

def initialize_synthesis(self, style_id: StyleId, skip_reinit: bool) -> None:
"""指定されたスタイル ID に関する合成機能を初期化する。既に初期化されていた場合は引数に応じて再初期化する。"""
self._core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit)
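A hedged sketch of consuming the new engine API directly (engine, query, and style_id are assumed to be a constructed TTSEngine, a 24 kHz mono AudioQuery, and a valid style):

frame_length, wave_stream = engine.synthesize_wave_stream(query, style_id)

received = 0
for wave in wave_stream:   # each chunk is a float32 array, roughly 1 s of audio
    received += len(wave)

# frame_length is what the router uses to size the wav header's data chunk;
# the streamed total is expected to match it for a 24 kHz mono query.
print(frame_length, received)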