Skip to content

Commit

Permalink
response wav instead of pcm
Browse files Browse the repository at this point in the history
  • Loading branch information
Yosshi999 committed Mar 4, 2025
1 parent 2e6e364 commit f3a779a
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 17 deletions.
35 changes: 28 additions & 7 deletions voicevox_engine/app/routers/tts_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,12 +386,12 @@ def multi_synthesis(
responses={
200: {
"content": {
"application/octet-stream": {"schema": {"type": "string", "format": "binary"}}
"audio/wav": {"schema": {"type": "string", "format": "binary"}}
},
}
},
tags=["音声合成"],
summary="ストリーミングで音声合成し、24kHz-モノラル-リトルエンディアン-符号付き16bitのlinear PCMバイナリを返す",
summary="ストリーミングで音声合成し、wavバイナリを逐次的に返す。24kHzモノラルのみ対応",
)
def stream_synthesis(
query: AudioQuery,
Expand All @@ -416,14 +416,35 @@ def stream_synthesis(
)
version = core_version or LATEST_VERSION
engine = tts_engines.get_engine(version)
wave_generator = engine.synthesize_wave_stream(
frame_length, wave_generator = engine.synthesize_wave_stream(
query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak
)
def generate_pcm(wave_generator):
def generate_wav():
    """Yield a 16-bit linear PCM WAV stream: one 44-byte header, then the audio body.

    The header (RIFF descriptor, ``fmt `` chunk and ``data`` chunk header) is
    emitted once, up front, so the total payload size has to be known before
    any audio has been rendered. It is derived from ``frame_length`` as
    reported by the engine. After the header, every chunk taken from
    ``wave_generator`` is clipped to [-1, 1], scaled to signed 16-bit
    little-endian integers and yielded as raw bytes.
    """
    channel_size = 2 if query.outputStereo else 1
    block_size = 16 * channel_size // 8  # bytes per sample frame (all channels)
    block_rate = query.outputSamplingRate * block_size  # bytes per second
    # The data chunk length must count every channel. Using a fixed 2 bytes per
    # frame would declare only half of the payload whenever stereo is requested.
    # NOTE(review): frame_length is used as the number of output sample frames;
    # if the engine counts it at a rate other than query.outputSamplingRate the
    # declared size will still be off -- confirm against the engine.
    data_size = frame_length * block_size
    file_size = data_size + 44  # 44 = size of the header yielded below
    # yield wav header, fmt chunk, and data chunk header
    yield (
        b"RIFF"
        + (file_size - 8).to_bytes(4, "little")  # RIFF size excludes "RIFF" + this field
        + b"WAVEfmt "
        + (16).to_bytes(4, "little")  # fmt header length
        + (1).to_bytes(2, "little")  # PCM
        + channel_size.to_bytes(2, "little")
        + query.outputSamplingRate.to_bytes(4, "little")
        + block_rate.to_bytes(4, "little")
        + block_size.to_bytes(2, "little")
        + (16).to_bytes(2, "little")  # bit depth
        + b"data"
        + data_size.to_bytes(4, "little")
    )
    # yield data chunk body
    for wave in wave_generator:
        pcm = (wave.clip(-1, 1) * 32767).astype('<i2')
        yield pcm.tobytes()
return StreamingResponse(generate_wav(), media_type="audio/wav")

@router.post(
"/sing_frame_audio_query",
Expand Down
23 changes: 13 additions & 10 deletions voicevox_engine/tts_pipeline/tts_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,8 @@ def synthesize_wave_stream(
query: AudioQuery,
style_id: StyleId,
enable_interrogative_upspeak: bool = True,
) -> Generator[NDArray[np.float32], None, None]:
) -> tuple[int, Generator[NDArray[np.float32], None, None]]:
"""生成音声全体のフレーム数と音声波形を生成する同期ストリームを返す"""
valid_chunk_size = 94 # 1sec * 24000Hz / 256frame
query = copy.deepcopy(query)
query.accent_phrases = _apply_interrogative_upspeak(
Expand All @@ -606,15 +607,17 @@ def synthesize_wave_stream(

phoneme, f0 = _query_to_decoder_feature(query)
audio_feature = self._core.safe_generate_full_intermediate(phoneme, f0, style_id)
for render_start in range(0, len(audio_feature), valid_chunk_size):
render_end = min(render_start + valid_chunk_size, len(audio_feature))
slice_start = render_start
slice_end = render_end + 2 * self._core.margin_width
feature_segment = audio_feature[slice_start:slice_end, :]
raw_wave_with_margin, sr_raw_wave = self._core.safe_render_audio_segment(feature_segment, style_id)
raw_wave = raw_wave_with_margin[self._core.margin_width * 256:-self._core.margin_width * 256]
wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
yield wave
def wave_generator():
    """Yield the output waveform one rendered chunk at a time.

    ``audio_feature`` is walked in windows of ``valid_chunk_size`` frames.
    Each window is extended by ``2 * margin_width`` extra frames before it is
    handed to the core renderer, and ``margin_width * 256`` samples are then
    cut from both ends of the rendered wave before it is converted with
    ``raw_wave_to_output_wave`` and yielded.
    """
    total_frames = len(audio_feature)
    margin_width = self._core.margin_width
    # presumably 256 output samples per feature frame -- the same factor is
    # used for the total length returned next to this generator
    margin_samples = margin_width * 256
    for render_start in range(0, total_frames, valid_chunk_size):
        render_end = min(render_start + valid_chunk_size, total_frames)
        feature_segment = audio_feature[render_start : render_end + 2 * margin_width, :]
        raw_wave_with_margin, sr_raw_wave = self._core.safe_render_audio_segment(feature_segment, style_id)
        # Use an explicit end index: a `[m:-m]` slice collapses to an empty
        # array when the margin is 0, because -0 == 0.
        trimmed_end = len(raw_wave_with_margin) - margin_samples
        raw_wave = raw_wave_with_margin[margin_samples:trimmed_end]
        yield raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
return len(audio_feature) * 256, wave_generator()

def initialize_synthesis(self, style_id: StyleId, skip_reinit: bool) -> None:
"""指定されたスタイル ID に関する合成機能を初期化する。既に初期化されていた場合は引数に応じて再初期化する。"""
Expand Down

0 comments on commit f3a779a

Please sign in to comment.