ava-tts / app.py
mathiasvinicius's picture
fix: patch GPT.generate para remover kwargs de token deprecated (transformers>=4.45)
97f1d86
import sys
import types
# ── ZeroGPU startup patch ─────────────────────────────────────────────────────
# torchaudio built against CUDA 13 hangs on import when no GPU is present.
# We inject a stub for torchaudio._extension into sys.modules BEFORE any
# TTS/torchaudio import. Python uses the stub without running the real
# __init__.py, skipping all of the CUDA extension loading logic.
# Audio automatically falls back to the soundfile backend (pure Python, no CUDA).
def _make_torchaudio_ext_stub(name):
    """Build a stand-in module for ``torchaudio._extension``.

    Root problem (per the original author's notes): torchaudio built against
    CUDA 13 hangs during ZeroGPU startup, so this stub is registered in
    ``sys.modules`` before torchaudio is imported.

    Design notes:
    - ``__file__`` is ``None`` so ``inspect.getfile()`` raises ``TypeError``
      (handled by ``getmodule``) instead of the fallback returning a callable
      where a path string is expected.
    - The fallback raises ``AttributeError`` for dunder names so Python
      handles them normally.
    - Missing boolean-looking flags (``_IS_*``, ``_HAS_*``, ``is_*``,
      ``has_*``) resolve to ``False``.
    - Any other missing attribute resolves to a no-op callable
      (e.g. ``fail_if_no_align``).

    Args:
        name: Fully qualified module name the stub should carry.

    Returns:
        A ``types.ModuleType`` instance with the attributes described above.
    """
    stub = types.ModuleType(name)

    # Parent package is everything before the last dot; a dot-less name is
    # its own package.
    parent, dot, _ = name.rpartition(".")
    explicit_attrs = {
        # Explicit module dunders so they never reach the fallback below.
        "__file__": None,
        "__spec__": None,
        "__loader__": None,
        "__package__": parent if dot else name,
        # Known attributes of the extension module.
        "_IS_TORCHAUDIO_EXT_AVAILABLE": False,
        "_IS_SOX_AVAILABLE": False,
        "_IS_FFMPEG_AVAILABLE": False,
        "_load_lib": lambda *a, **kw: False,
        "_check_cuda_version": lambda: None,
        "_init_ffmpeg": lambda: None,
    }
    for key, value in explicit_attrs.items():
        setattr(stub, key, value)

    flag_prefixes = ("_IS_", "_HAS_", "is_", "has_")

    def _missing_attr(attr):
        # Never intercept dunders: let Python see a regular AttributeError.
        if attr.startswith("__") and attr.endswith("__"):
            raise AttributeError(f"module {name!r} has no attribute {attr!r}")
        # Boolean-looking flags report "not available".
        if attr.startswith(flag_prefixes):
            return False
        # Anything else becomes a callable that accepts anything and returns None.
        return lambda *a, **kw: None

    stub.__getattr__ = _missing_attr
    return stub
# Register the stubs under both module names so that later imports of
# torchaudio._extension / torchaudio._extension.utils find these entries in
# sys.modules instead of loading the real extension package.
sys.modules["torchaudio._extension"] = _make_torchaudio_ext_stub("torchaudio._extension")
sys.modules["torchaudio._extension.utils"] = _make_torchaudio_ext_stub("torchaudio._extension.utils")
# Coqui TTS 0.22 / xtts.py imports stream_generator, which in turn imports
# BeamSearchScorer — removed entirely from transformers>=4.50 (it no longer
# exists even in the internal module). For normal (non-streaming) inference we
# only need init_stream_support as a no-op, so we inject a complete stub module.
def _make_stub_module(name, pkg, **attrs):
    """Create a generic placeholder module.

    Args:
        name: Module name given to the stub.
        pkg: Value stored in the stub's ``__package__``.
        **attrs: Extra attributes set on the stub after the module dunders
            (so they win over the defaults if names collide).

    Returns:
        A ``types.ModuleType`` whose ``__file__``, ``__spec__`` and
        ``__loader__`` are ``None``. Missing non-dunder attributes resolve to
        a no-op callable; missing dunders raise ``AttributeError``.
    """
    stub = types.ModuleType(name)
    module_dunders = {
        "__file__": None,
        "__spec__": None,
        "__loader__": None,
        "__package__": pkg,
    }
    # Dunders first, caller-supplied attributes second (same order as before).
    for source in (module_dunders, attrs):
        for key, value in source.items():
            setattr(stub, key, value)

    def _missing_attr(attr):
        is_dunder = attr.startswith("__") and attr.endswith("__")
        if is_dunder:
            raise AttributeError(attr)
        return lambda *a, **kw: None

    stub.__getattr__ = _missing_attr
    return stub
# Register the stub under the stream_generator module name; the only explicit
# attribute is init_stream_support, a zero-argument no-op.
sys.modules["TTS.tts.layers.xtts.stream_generator"] = _make_stub_module(
    "TTS.tts.layers.xtts.stream_generator",
    "TTS.tts.layers.xtts",
    init_stream_support=lambda: None,
)
# ─────────────────────────────────────────────────────────────────────────────
import warnings
import sys
# Silence forward-compatibility warnings from transformers (GPT2InferenceModel/GenerationMixin)
warnings.filterwarnings("ignore", message=".*GenerationMixin.*")
warnings.filterwarnings("ignore", message=".*prepare_inputs_for_generation.*")
# "Invalid file descriptor: -1" comes from an asyncio __del__ during GC — it
# bypasses the warnings system and goes to sys.unraisablehook, so we filter it there.
# Keep the previously installed hook so everything we do not filter is still reported.
_orig_unraisablehook = sys.unraisablehook


def _quiet_unraisablehook(unraisable):
    """Drop "Invalid file descriptor" ValueErrors; delegate everything else.

    Args:
        unraisable: The argument Python passes to ``sys.unraisablehook``;
            only its ``exc_value`` attribute is inspected here.
    """
    exc = unraisable.exc_value
    is_fd_noise = isinstance(exc, ValueError) and "Invalid file descriptor" in str(exc)
    if not is_fd_noise:
        _orig_unraisablehook(unraisable)


sys.unraisablehook = _quiet_unraisablehook
import gradio as gr
import spaces
import traceback
import uuid
import io
import os
import json
import shutil
import tempfile
from pathlib import Path
from typing import Optional
from huggingface_hub import snapshot_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
# ── Configuration ─────────────────────────────────────────────────────────────
# Hub repository passed to snapshot_download and the local directory it is
# downloaded into.
MODEL_HF_REPO = "tts-hub/XTTS-v2"
MODEL_CACHE_DIR = Path("/tmp/xtts_v2")
# Directory holding saved voice samples (one "<name>.wav" per voice); created
# at import time if missing.
VOICES_DIR = Path("voices")
VOICES_DIR.mkdir(exist_ok=True)
# JSON file that stores the active voice name (see _load_config/_save_config).
CONFIG_FILE = VOICES_DIR / "config.json"
# Extensions accepted for uploaded voice samples (checked in ui_save_voice).
SUPPORTED_FORMATS = [".wav", ".mp3", ".ogg", ".flac", ".m4a", ".aac", ".opus", ".wma", ".webm"]
# (display name, language code) pairs offered in the UI.
LANGUAGES = [
    ("PortuguΓͺs (BR)", "pt"), ("English", "en"), ("EspaΓ±ol", "es"),
    ("FranΓ§ais", "fr"), ("Deutsch", "de"), ("Italiano", "it"),
    ("Polski", "pl"), ("TΓΌrkΓ§e", "tr"), ("Русский", "ru"),
    ("Nederlands", "nl"), ("ČeΕ‘tina", "cs"), ("Ψ§Ω„ΨΉΨ±Ψ¨ΩŠΨ©", "ar"),
    ("δΈ­ζ–‡", "zh-cn"), ("Magyar", "hu"), ("ν•œκ΅­μ–΄", "ko"), ("ζ—₯本θͺž", "ja"),
]
# Dropdown labels of the form "<name> (<code>)" and the reverse lookup from
# label to language code.
LANG_CHOICES = [f"{name} ({code})" for name, code in LANGUAGES]
LANG_MAP = {f"{name} ({code})": code for name, code in LANGUAGES}
def _load_config() -> dict:
    """Load the voice configuration stored in ``CONFIG_FILE``.

    Returns:
        The parsed JSON object. Falls back to ``{"active_voice": None}`` when
        the file is missing, cannot be read, is not valid JSON, or does not
        contain a JSON object (callers rely on ``dict`` methods).
    """
    default = {"active_voice": None}
    try:
        loaded = json.loads(CONFIG_FILE.read_text())
    except FileNotFoundError:
        # No config saved yet: the normal first-run case, nothing to report.
        return default
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Still best-effort, but no longer silent.
        print(f"Aviso: não foi possível ler {CONFIG_FILE} ({exc}); usando configuração padrão.")
        return default
    if not isinstance(loaded, dict):
        print(f"Aviso: {CONFIG_FILE} não contém um objeto JSON; usando configuração padrão.")
        return default
    return loaded
def _save_config(cfg: dict):
    """Write *cfg* to ``CONFIG_FILE`` as JSON indented by two spaces."""
    serialized = json.dumps(cfg, indent=2)
    CONFIG_FILE.write_text(serialized)
# ── Load the XTTS-v2 model from the HuggingFace Hub ───────────────────────────
# Avoids the dead Coqui servers. snapshot_download downloads into a local cache
# and is idempotent (no-op if already present). CPU at startup — GPU only inside @spaces.GPU.
print("Baixando/verificando modelo XTTS-v2 do HF Hub...")
# Optional token, read from either environment variable name.
_hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
_model_dir = snapshot_download(
    repo_id=MODEL_HF_REPO,
    local_dir=str(MODEL_CACHE_DIR),
    token=_hf_token,
)
_xtts_config = XttsConfig()
_xtts_config.load_json(str(MODEL_CACHE_DIR / "config.json"))
# PyTorch 2.6+ changed the default to weights_only=True, which breaks TTS
# checkpoints that serialize arbitrary Python objects. We patch torch.load to use
# weights_only=False when not specified — considered safe by the author because
# the model comes from the HF Hub.
# NOTE(review): weights_only=False unpickles arbitrary objects, and the patch is
# process-wide; confirm that only trusted checkpoints are ever loaded.
import torch as _torch
_orig_torch_load = _torch.load
def _torch_load_compat(*args, **kwargs):
    """Call the saved original ``torch.load`` with ``weights_only`` defaulting to False.

    An explicit ``weights_only`` supplied by the caller is passed through
    unchanged; all other arguments are forwarded as-is.
    """
    effective_kwargs = {"weights_only": False, **kwargs}
    return _orig_torch_load(*args, **effective_kwargs)
# Install the compatibility wrapper before the checkpoint is loaded.
_torch.load = _torch_load_compat
model = Xtts.init_from_config(_xtts_config)
model.load_checkpoint(_xtts_config, checkpoint_dir=str(MODEL_CACHE_DIR), eval=True)
model.cpu()
print("Modelo XTTS-v2 carregado na CPU.")
# In transformers>=4.50, PreTrainedModel no longer inherits from GenerationMixin.
# GPT2InferenceModel (inside XTTS) loses .generate() → AttributeError.
# We fix this by adding GenerationMixin explicitly to the inheritance chain.
from transformers import GenerationMixin as _GenMixin, GenerationConfig as _GenConfig
from TTS.tts.layers.xtts.gpt import GPT2InferenceModel as _GPT2Inf, GPT as _GPT
if not issubclass(_GPT2Inf, _GenMixin):
    _GPT2Inf.__bases__ = (_GenMixin,) + _GPT2Inf.__bases__
    # NOTE(review): assumed to belong to the `if` body (the message says the
    # patch was applied) — confirm against the original indentation.
    print("Patch GenerationMixin aplicado ao GPT2InferenceModel.")
# gpt.py passes eos/bos/pad_token_id as direct kwargs to gpt_inference.generate().
# In transformers>=4.45 these kwargs are deprecated and IGNORED — the model runs
# until max_length and generates noise. The fix has two parts:
#
# 1. Inject the token IDs into gpt_inference's generation_config (the official source).
# 2. Patch GPT.generate to strip those kwargs before passing them on, avoiding a conflict.
_stop = int(model.gpt.stop_audio_token)
_start = int(model.gpt.start_audio_token)
# The stop token doubles as the pad token.
model.gpt.gpt_inference.generation_config = _GenConfig(
    eos_token_id=_stop,
    bos_token_id=_start,
    pad_token_id=_stop,
)
print(f"generation_config injetado: eos={_stop} bos={_start}.")
# Keep the unpatched method so the wrapper can delegate to it.
_orig_gpt_cls_generate = _GPT.generate
def _patched_gpt_cls_generate(self, cond_latents, text_inputs, **hf_generate_kwargs):
    """Replacement for ``GPT.generate`` that strips the token-id kwargs.

    ``eos_token_id``, ``bos_token_id`` and ``pad_token_id`` are removed
    (they live in ``generation_config`` instead); every other keyword
    argument is forwarded unchanged to the original method.
    """
    dropped = ("eos_token_id", "bos_token_id", "pad_token_id")
    forwarded = {
        key: value for key, value in hf_generate_kwargs.items() if key not in dropped
    }
    return _orig_gpt_cls_generate(self, cond_latents, text_inputs, **forwarded)
# Install the wrapper on the class so every GPT instance uses it.
_GPT.generate = _patched_gpt_cls_generate
print("Patch GPT.generate aplicado: kwargs de token removidos (vivem em generation_config).")
# Patch XTTS's load_audio to use soundfile instead of torchaudio.
# torchaudio 2.9+ uses torchcodec as its default backend (not installed).
# Our reference audio is already 22050 Hz mono WAV (via convert_to_wav).
# NOTE(review): api_speech writes speaker_wav_b64 bytes to disk without calling
# convert_to_wav, so that path relies on the mono/resample fallback in the loader.
import torch as _torch2
import soundfile as _sf
import numpy as _np
def _load_audio_soundfile(audiopath, sampling_rate=22050):
    """Read an audio file with soundfile and return it as a ``(1, n)`` float tensor.

    Args:
        audiopath: Path of the file to read (converted with ``str``).
        sampling_rate: Target sample rate; the signal is resampled with
            ``scipy.signal.resample`` when the file's rate differs.

    Returns:
        A ``FloatTensor`` with a leading dimension of size 1. Only the tensor
        is returned (not a tuple), matching how xtts.py uses ``load_audio``.
    """
    samples, source_rate = _sf.read(str(audiopath), dtype="float32")

    # Multi-channel input is averaged across axis 1 to obtain mono.
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples

    if source_rate == sampling_rate:
        resampled = mono
    else:
        # scipy is imported lazily: it is only needed on the resample path.
        from scipy import signal as _sig
        target_len = int(len(mono) * sampling_rate / source_rate)
        resampled = _sig.resample(mono, target_len)

    waveform = _torch2.FloatTensor(resampled)
    return waveform.unsqueeze(0)
# Rebind load_audio in the TTS.tts.models.xtts module namespace to the
# soundfile-based loader defined above.
import TTS.tts.models.xtts as _xtts_mod
_xtts_mod.load_audio = _load_audio_soundfile
# ── Audio conversion ──────────────────────────────────────────────────────────
def convert_to_wav(input_path: str) -> str:
    """Convert an audio file to a temporary 22050 Hz mono WAV file.

    Args:
        input_path: Path of the source audio. The decoder format is chosen
            from the file extension; unknown extensions are treated as WAV.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        Exception: Whatever pydub raises while decoding or exporting. If the
            export fails, the temporary file is removed before re-raising.
    """
    fmt_map = {
        ".mp3": "mp3", ".ogg": "ogg", ".flac": "flac",
        ".m4a": "m4a", ".aac": "aac", ".opus": "ogg",
        ".wma": "asf", ".webm": "webm", ".wav": "wav",
    }
    src = Path(input_path)
    fmt = fmt_map.get(src.suffix.lower(), "wav")
    audio = AudioSegment.from_file(str(src), format=fmt)
    audio = audio.set_channels(1).set_frame_rate(22050)

    # Reserve the output path and close the handle immediately instead of
    # leaving it open until garbage collection.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
        out_path = out.name
    try:
        exported = audio.export(out_path, format="wav")
    except Exception:
        # Do not leave an orphaned temp file behind when the export fails.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    # pydub returns the output file handle; close it explicitly when present.
    close = getattr(exported, "close", None)
    if callable(close):
        close()
    return out_path
# ── Generation core (exclusive GPU via ZeroGPU) ───────────────────────────────
@spaces.GPU
def synthesize(text: str, speaker_wav: str, language: str = "pt") -> bytes:
    """Synthesize *text* in the voice of *speaker_wav* and return MP3 bytes.

    The module-level ``model`` is moved to the GPU for the duration of the
    call and always moved back to the CPU afterwards.

    Args:
        text: Text to synthesize.
        speaker_wav: Path of the reference audio used for voice conditioning.
        language: Language code passed to the model's ``inference``.

    Returns:
        The generated audio, peak-normalized, encoded as 128k MP3.

    Raises:
        Exception: Any error from the model or the encoders is logged with
            its traceback and re-raised.
    """
    try:
        # Inside the try so that a failed or partial transfer is still undone
        # by the finally clause below.
        model.cuda()
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=[speaker_wav],
            gpt_cond_len=30,
            max_ref_length=60,
        )
        out = model.inference(
            text=text,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.7,
        )
        wav = np.array(out["wav"], dtype=np.float32)
        # Peak-normalize, then convert to 16-bit PCM.
        max_val = np.abs(wav).max()
        if max_val > 0:
            wav = wav / max_val
        wav_int16 = (wav * 32767).astype(np.int16)
        # Intermediate in-memory WAV (24000 Hz) → MP3 via pydub.
        wav_buf = io.BytesIO()
        sf.write(wav_buf, wav_int16, 24000, format="WAV", subtype="PCM_16")
        wav_buf.seek(0)
        segment = AudioSegment.from_wav(wav_buf)
        mp3_buf = io.BytesIO()
        segment.export(mp3_buf, format="mp3", bitrate="128k")
        return mp3_buf.getvalue()
    except Exception:
        # Log here as well: the caller may only surface a summary.
        print("=== SYNTHESIZE ERROR ===", flush=True)
        print(traceback.format_exc(), flush=True)
        raise
    finally:
        model.cpu()
# ── Helpers de voz ────────────────────────────────────────────────────────────
def list_voices() -> list[str]:
    """Return the sorted names (file stems) of the ``*.wav`` files in ``VOICES_DIR``."""
    names = [wav_file.stem for wav_file in VOICES_DIR.glob("*.wav")]
    names.sort()
    return names
def get_voice_path(voice_name: str) -> Optional[str]:
    """Return the path of the saved voice *voice_name*, or ``None``.

    Args:
        voice_name: Name of the voice (the ``.wav`` file stem). It may come
            straight from an API request, so it is not trusted.

    Returns:
        ``str`` path of ``VOICES_DIR/<voice_name>.wav`` when that file exists
        directly inside ``VOICES_DIR``; ``None`` otherwise.
    """
    candidate = VOICES_DIR / f"{voice_name}.wav"
    # Names containing path separators ("../x", "/abs/x") resolve outside
    # VOICES_DIR; treat them as "not found" instead of exposing other files.
    if candidate.parent != VOICES_DIR:
        return None
    return str(candidate) if candidate.exists() else None
def get_active_voice() -> Optional[str]:
    """Return the configured active voice name, or ``None``.

    ``None`` is returned when no active voice is configured or when its
    ``.wav`` file no longer exists in ``VOICES_DIR``.
    """
    active = _load_config().get("active_voice")
    if not active:
        return None
    # Only report the voice while its file is still present.
    wav_file = VOICES_DIR / f"{active}.wav"
    return active if wav_file.exists() else None
def set_active_voice(voice_name: str) -> str:
    """Persist *voice_name* as the active voice and return it.

    The existing configuration is loaded, its ``"active_voice"`` entry is
    replaced, and the result is saved back.
    """
    updated = _load_config()
    updated.update(active_voice=voice_name)
    _save_config(updated)
    return voice_name
# ── UI functions ──────────────────────────────────────────────────────────────
def ui_generate(text, voice_sel, ref_audio, lang_sel):
    """Gradio handler: synthesize *text* and return ``(audio, status)``.

    The reference voice is chosen in this order: the uploaded/recorded
    *ref_audio* (converted to a temporary WAV that is deleted afterwards), the
    voice selected in *voice_sel*, then the configured active voice.

    Args:
        text: Text to synthesize.
        voice_sel: Name of a saved voice, or a falsy value.
        ref_audio: File path of an ad-hoc reference recording, or a falsy value.
        lang_sel: Language dropdown label; unknown labels fall back to ``"pt"``.

    Returns:
        ``(audio_bytes, status_message)`` on success, ``(None, message)`` when
        input is missing, or ``(None, traceback_text)`` on any exception.
    """
    try:
        if not (text and text.strip()):
            return None, "Texto vazio."
        language = LANG_MAP.get(lang_sel, "pt")

        # Only a converted ad-hoc reference is a temp file we must delete.
        is_temp = bool(ref_audio)
        if ref_audio:
            wav_path, label = convert_to_wav(ref_audio), "referΓͺncia enviada"
        elif voice_sel:
            wav_path, label = get_voice_path(voice_sel), voice_sel
        else:
            active = get_active_voice()
            if not active:
                return None, "Nenhuma voz disponΓ­vel. Salve uma voz na aba Vozes ou envie uma referΓͺncia."
            wav_path, label = get_voice_path(active), f"{active} (ativa)"

        if not wav_path:
            return None, "Arquivo de voz nΓ£o encontrado."

        try:
            audio_bytes = synthesize(text.strip(), wav_path, language)
        finally:
            if is_temp:
                os.unlink(wav_path)

        size_kb = len(audio_bytes) // 1024
        return audio_bytes, f"Gerado com '{label}' β€” {size_kb} KB"
    except Exception:
        # The status box shows the traceback instead of the UI raising.
        return None, traceback.format_exc()
def ui_save_voice(name, audio_file):
    """Gradio handler: store an uploaded sample as a named voice.

    The name is stripped, lower-cased, and has spaces and path separators
    replaced by underscores. The audio is converted to WAV and moved to
    ``VOICES_DIR/<name>.wav``. If no active voice exists yet, the new voice
    becomes the active one.

    Args:
        name: Voice name typed by the user.
        audio_file: Path of the uploaded audio file.

    Returns:
        A 4-tuple ``(status_message, manage_dropdown_update,
        generate_dropdown_update, active_dropdown_update)``. On validation
        errors or exceptions the three updates are empty ``gr.update()``
        objects and the message explains the problem (a traceback for
        exceptions).
    """
    try:
        if not name or not name.strip():
            return "Nome da voz Γ© obrigatΓ³rio.", gr.update(), gr.update(), gr.update()
        if not audio_file:
            return "Envie um arquivo de Γ‘udio.", gr.update(), gr.update(), gr.update()

        name = name.strip().lower().replace(" ", "_")
        # The name becomes a file name: neutralize path separators so input
        # such as "../x" cannot write outside VOICES_DIR.
        for sep in ("/", "\\"):
            name = name.replace(sep, "_")

        src = Path(audio_file)
        if src.suffix.lower() not in SUPPORTED_FORMATS:
            return f"Formato '{src.suffix}' nΓ£o suportado. Use: {', '.join(SUPPORTED_FORMATS)}", gr.update(), gr.update(), gr.update()

        wav_tmp = convert_to_wav(audio_file)
        dest = VOICES_DIR / f"{name}.wav"
        try:
            shutil.move(wav_tmp, dest)
        except Exception:
            # Do not leave the converted temp file behind if the move fails.
            if os.path.exists(wav_tmp):
                os.unlink(wav_tmp)
            raise

        voices = list_voices()
        active = get_active_voice()
        msg = f"Voz '{name}' salva."
        if not active:
            set_active_voice(name)
            active = name
            msg += " Definida como voz ativa."
        return (
            msg,
            gr.update(choices=voices, value=name),    # manage dropdown
            gr.update(choices=voices, value=name),    # generate dropdown
            gr.update(choices=voices, value=active),  # active dropdown
        )
    except Exception:
        return traceback.format_exc(), gr.update(), gr.update(), gr.update()
def ui_delete_voice(voice_name):
    """Gradio handler: delete a saved voice and refresh the dropdowns.

    If the deleted voice was the active one, the first remaining voice (or
    ``None``) becomes active.

    Args:
        voice_name: Name of the voice to delete.

    Returns:
        A 4-tuple ``(status_message, manage_dropdown_update,
        generate_dropdown_update, active_dropdown_update)``. When no valid
        voice is given, or on exceptions, the three updates are empty
        ``gr.update()`` objects.
    """
    try:
        if not voice_name:
            return "Selecione uma voz.", gr.update(), gr.update(), gr.update()

        p = VOICES_DIR / f"{voice_name}.wav"
        # Component values can be forged by API clients: refuse names that
        # resolve outside VOICES_DIR instead of deleting arbitrary .wav files.
        if p.parent != VOICES_DIR:
            return "Selecione uma voz.", gr.update(), gr.update(), gr.update()
        if p.exists():
            p.unlink()

        voices = list_voices()
        first = voices[0] if voices else None
        cfg = _load_config()
        if cfg.get("active_voice") == voice_name:
            cfg["active_voice"] = first
            _save_config(cfg)
        active = get_active_voice()
        return (
            f"Voz '{voice_name}' excluΓ­da.",
            gr.update(choices=voices, value=first),
            gr.update(choices=voices, value=first),
            gr.update(choices=voices, value=active),
        )
    except Exception:
        return traceback.format_exc(), gr.update(), gr.update(), gr.update()
def ui_set_active(voice_name):
    """Gradio handler: mark *voice_name* as the active voice.

    Returns:
        ``(status_message, dropdown_update)``. When no voice is selected the
        update is an empty ``gr.update()`` and nothing is saved.
    """
    if voice_name:
        set_active_voice(voice_name)
        return f"Voz ativa: '{voice_name}'", gr.update(value=voice_name)
    return "Selecione uma voz.", gr.update()
def ui_preview_voice(voice_name):
    """Gradio handler: return the file path of *voice_name* for preview.

    Returns ``None`` when no voice is selected or its file is not found.
    """
    if not voice_name:
        return None
    # get_voice_path yields a path string or None; normalize any falsy result to None.
    return get_voice_path(voice_name) or None
def ui_refresh_all():
    """Gradio handler: rebuild the choices of the three voice dropdowns.

    Returns:
        Three ``gr.update`` objects. The first two select the first voice in
        the list (or ``None``); the third selects the active voice.
    """
    voices = list_voices()
    active = get_active_voice()
    first = voices[0] if voices else None
    selections = (first, first, active)
    return tuple(gr.update(choices=voices, value=selected) for selected in selections)
# ── Gradio interface ──────────────────────────────────────────────────────────
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the widget order; confirm widget placement against the rendered UI.
# Voice list and active voice at startup, used as initial dropdown contents.
_voices_init = list_voices()
_active_init = get_active_voice()
with gr.Blocks(title="AVA TTS") as demo:
    gr.Markdown("# AVA TTS\nSΓ­ntese de voz com clone via XTTS-v2 β€’ API em `/v1/audio/speech`")
    # State shared between tabs
    # NOTE(review): voice_gen_dd is not referenced by any handler in this file.
    voice_gen_dd = gr.State(None)
    # ── Tab: Generate ─────────────────────────────────────────────────────────
    with gr.Tab("Gerar"):
        with gr.Row():
            with gr.Column(scale=2):
                txt_input = gr.Textbox(
                    label="Texto",
                    lines=5,
                    placeholder="Digite o texto a sintetizar...",
                )
                lang_dd = gr.Dropdown(
                    choices=LANG_CHOICES,
                    value="PortuguΓͺs (BR) (pt)",
                    label="Idioma",
                )
                with gr.Row():
                    voice_dd_gen = gr.Dropdown(
                        choices=_voices_init,
                        value=_active_init,
                        label="Voz (deixe vazio para usar a voz ativa)",
                    )
                    # Read-only badge showing the current active voice.
                    active_badge = gr.Textbox(
                        value=f"Ativa: {_active_init or 'β€”'}",
                        label="",
                        interactive=False,
                        scale=1,
                    )
                # An ad-hoc reference takes priority over the selected voice
                # (see ui_generate).
                ref_audio_gen = gr.Audio(
                    label="ReferΓͺncia ad-hoc (sobrepΓ΅e voz selecionada)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                gen_btn = gr.Button("Sintetizar", variant="primary")
            with gr.Column(scale=1):
                audio_out = gr.Audio(label="Resultado", type="numpy")
                gen_status = gr.Textbox(label="Status", interactive=False, lines=3)
        gen_btn.click(
            ui_generate,
            [txt_input, voice_dd_gen, ref_audio_gen, lang_dd],
            [audio_out, gen_status],
        )
    # ── Tab: Voices ───────────────────────────────────────────────────────────
    with gr.Tab("Vozes"):
        gr.Markdown(
            "Salve amostras de voz (3–10s de Γ‘udio limpo). "
            f"Formatos aceitos: **{', '.join(SUPPORTED_FORMATS)}** β€” convertidos automaticamente para WAV."
        )
        with gr.Row():
            # Column: add a voice
            with gr.Column():
                gr.Markdown("### Adicionar voz")
                new_name = gr.Textbox(label="Nome", placeholder="ex: ava, narrador, lucas")
                new_audio = gr.File(
                    label="Arquivo de Γ‘udio",
                    file_types=SUPPORTED_FORMATS,
                )
                save_btn = gr.Button("Salvar voz", variant="primary")
                save_status = gr.Textbox(label="Status", interactive=False, lines=2)
            # Column: manage existing voices
            with gr.Column():
                gr.Markdown("### Gerenciar vozes")
                with gr.Row():
                    active_dd = gr.Dropdown(
                        choices=_voices_init,
                        value=_active_init,
                        label="Voz ativa (usada como padrΓ£o na API)",
                    )
                    set_active_btn = gr.Button("Definir ativa", variant="secondary")
                voice_mgr_dd = gr.Dropdown(
                    choices=_voices_init,
                    value=_voices_init[0] if _voices_init else None,
                    label="Selecionar voz",
                )
                preview_audio = gr.Audio(label="Preview da referΓͺncia", type="filepath", interactive=False)
                with gr.Row():
                    refresh_btn = gr.Button("Atualizar lista")
                    del_btn = gr.Button("Excluir selecionada", variant="stop")
                mgr_status = gr.Textbox(label="Status", interactive=False, lines=2)
        # Events — Voices
        # Every action below is followed by a .then() step that re-renders the
        # active-voice badge from the current value of active_dd.
        voice_mgr_dd.change(ui_preview_voice, [voice_mgr_dd], [preview_audio])
        set_active_btn.click(
            ui_set_active,
            [active_dd],
            [mgr_status, active_dd],
        ).then(
            lambda v: f"Ativa: {v or 'β€”'}",
            [active_dd],
            [active_badge],
        )
        save_btn.click(
            ui_save_voice,
            [new_name, new_audio],
            [save_status, voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            lambda v: f"Ativa: {v or 'β€”'}",
            [active_dd],
            [active_badge],
        )
        del_btn.click(
            ui_delete_voice,
            [voice_mgr_dd],
            [mgr_status, voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            lambda v: f"Ativa: {v or 'β€”'}",
            [active_dd],
            [active_badge],
        )
        refresh_btn.click(
            ui_refresh_all,
            [],
            [voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            lambda v: f"Ativa: {v or 'β€”'}",
            [active_dd],
            [active_badge],
        )
    # ── Tab: API ──────────────────────────────────────────────────────────────
    # NOTE(review): the curl example saves to audio.wav, but api_speech responds
    # with media type audio/mpeg and an .mp3 filename.
    with gr.Tab("API"):
        gr.Markdown("""
## POST `/v1/audio/speech`
```bash
curl -X POST https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/audio/speech \\
-H "Authorization: Bearer $HF_TOKEN" \\
-H "Content-Type: application/json" \\
-d '{"model":"xtts-v2","input":"OlΓ‘! Eu sou a AVA.","voice":"ava","language":"pt"}' \\
--output audio.wav
```
| Campo | Tipo | DescriΓ§Γ£o |
|---|---|---|
| `input` | string | Texto a sintetizar |
| `voice` | string | Nome de voz salva (padrΓ£o: voz ativa) |
| `language` | string | CΓ³digo do idioma (padrΓ£o: `pt`) |
| `speaker_wav_b64` | string | Áudio WAV em base64 (alternativa ao `voice`) |
## GET `/v1/voices`
Retorna lista de vozes salvas e qual estΓ‘ ativa.
```bash
curl https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/voices \\
-H "Authorization: Bearer $HF_TOKEN"
```
""")
# ── OpenAI-compatible API ─────────────────────────────────────────────────────
class SpeechRequest(BaseModel):
    """Request body for ``POST /v1/audio/speech``."""

    # Model identifier; not read by api_speech.
    model: Optional[str] = "xtts-v2"
    # Text to synthesize (required).
    input: str
    # Saved voice name; None → use the active voice.
    voice: Optional[str] = None
    # Language code handed to synthesize; api_speech falls back to "pt" when empty.
    language: Optional[str] = "pt"
    # Base64-encoded reference audio; when set it takes priority over `voice`.
    speaker_wav_b64: Optional[str] = None
def api_list_voices():
    """Handler for ``GET /v1/voices``: saved voice names and the active voice."""
    payload = {"voices": list_voices()}
    payload["active"] = get_active_voice()
    return payload
def api_speech(req: SpeechRequest):
    """Handler for ``POST /v1/audio/speech``: synthesize speech as MP3.

    The reference voice is ``req.speaker_wav_b64`` when provided (decoded into
    a temporary ``.wav`` file that is removed afterwards); otherwise the saved
    voice ``req.voice``, falling back to the active voice.

    Args:
        req: Parsed request body.

    Returns:
        A ``StreamingResponse`` with media type ``audio/mpeg`` on success, or
        a ``JSONResponse`` with status 400 (empty input, invalid or empty
        base64 audio), 404 (no usable voice) or 500 (any other exception).
    """
    try:
        if not req.input or not req.input.strip():
            return JSONResponse(status_code=400, content={"error": "input vazio"})

        cleanup = False
        if req.speaker_wav_b64:
            import base64
            import binascii
            # Malformed client input is a 400, not an internal error.
            try:
                raw = base64.b64decode(req.speaker_wav_b64)
            except (binascii.Error, ValueError):
                return JSONResponse(status_code=400, content={"error": "speaker_wav_b64 inválido: não é base64 válido."})
            if not raw:
                return JSONResponse(status_code=400, content={"error": "speaker_wav_b64 inválido: áudio vazio."})
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(raw)
                speaker_wav = tmp.name
            cleanup = True
        else:
            voice_name = req.voice or get_active_voice()
            if not voice_name:
                return JSONResponse(status_code=404, content={"error": "Nenhuma voz ativa. Defina uma voz na UI ou passe 'voice' no request."})
            speaker_wav = get_voice_path(voice_name)
            if not speaker_wav:
                return JSONResponse(status_code=404, content={"error": f"Voz '{voice_name}' nΓ£o encontrada. Use GET /v1/voices."})

        try:
            audio_bytes = synthesize(req.input.strip(), speaker_wav, req.language or "pt")
        finally:
            # Only the decoded temp file is ours to delete; saved voices stay.
            if cleanup:
                os.unlink(speaker_wav)

        return StreamingResponse(
            io.BytesIO(audio_bytes),
            media_type="audio/mpeg",
            headers={"Content-Disposition": f'attachment; filename="speech-{uuid.uuid4().hex[:8]}.mp3"'},
        )
    except Exception:
        # NOTE(review): the full traceback is returned to the client; confirm
        # this is acceptable for a publicly reachable endpoint.
        return JSONResponse(status_code=500, content={"error": traceback.format_exc()})
# Launch with prevent_thread_lock=True so that execution continues past
# launch() and the extra API routes can be registered on the returned app;
# block_thread() is called last to keep the main thread alive.
app, _, _ = demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    prevent_thread_lock=True,
    ssr_mode=False,
    theme=gr.themes.Soft(),
)
app.add_api_route("/v1/voices", api_list_voices, methods=["GET"])
app.add_api_route("/v1/audio/speech", api_speech, methods=["POST"])
demo.block_thread()