Spaces:

mathiasvinicius
/

ava-tts

Paused

App Files Files Community

ava-tts / app.py

mathiasvinicius

fix: patch GPT.generate para remover kwargs de token deprecated (transformers>=4.45)

97f1d86 about 2 months ago

raw

history blame contribute delete

26.4 kB

	import sys
	import types

	# ── ZeroGPU startup patch ─────────────────────────────────────────────────────
	# torchaudio compilado contra CUDA 13 trava ao importar quando não há GPU.
	# Injetamos um stub para torchaudio._extension em sys.modules ANTES de qualquer
	# import do TTS/torchaudio. O Python usa o stub sem executar o __init__.py real,
	# evitando toda a lógica de carregamento de extensão CUDA.
	# O áudio cai automaticamente para o backend soundfile (puro Python, sem CUDA).
	def _make_torchaudio_ext_stub(name):
		"""Build a placeholder module that stands in for ``torchaudio._extension``.

		The stub is meant to be registered in ``sys.modules`` before torchaudio is
		imported, so the real extension package (and its CUDA loading logic) is
		never executed.

		Args:
			name: Dotted module name the stub will be registered under.

		Returns:
			A ``types.ModuleType`` whose missing attributes are resolved lazily:

			* dunder names raise ``AttributeError`` (normal Python behaviour);
			* names starting with ``_IS_``, ``_HAS_``, ``is_`` or ``has_`` are ``False``;
			* anything else is a callable that accepts any arguments and returns ``None``.
		"""
		mod = types.ModuleType(name)
		# Explicit module dunders so they never fall through to __getattr__.
		# __file__ = None keeps path-expecting callers from receiving a callable.
		mod.__file__ = None
		mod.__spec__ = None
		mod.__loader__ = None
		mod.__package__ = name.rsplit(".", 1)[0] if "." in name else name
		# Known attributes of the real extension module.
		mod._IS_TORCHAUDIO_EXT_AVAILABLE = False
		mod._IS_SOX_AVAILABLE = False
		mod._IS_FFMPEG_AVAILABLE = False
		# Must accept any call shape: positional and keyword arguments alike.
		mod._load_lib = lambda *a, **kw: False
		mod._check_cuda_version = lambda: None
		mod._init_ffmpeg = lambda: None

		def __getattr__(attr):
			# Never intercept dunders; let Python handle the AttributeError.
			if attr.startswith("__") and attr.endswith("__"):
				raise AttributeError(f"module {name!r} has no attribute {attr!r}")
			# Boolean-looking flags resolve to False.
			if attr.startswith(("_IS_", "_HAS_", "is_", "has_")):
				return False
			# Any other missing attribute becomes a no-op callable.
			return lambda *a, **kw: None

		mod.__getattr__ = __getattr__
		return mod

	# Register both stub modules up front, before anything imports torchaudio.
	sys.modules.update(
		{
			_stub_name: _make_torchaudio_ext_stub(_stub_name)
			for _stub_name in ("torchaudio._extension", "torchaudio._extension.utils")
		}
	)

	# Coqui TTS 0.22 / xtts.py importa stream_generator que por sua vez importa
	# BeamSearchScorer — removido completamente do transformers>=4.50 (nem no módulo
	# interno existe mais). Para inferência normal (não-streaming) só precisamos de
	# init_stream_support como no-op. Injetamos um stub completo do módulo.
	def _make_stub_module(name, pkg, **attrs):
		"""Create a generic stand-in module with a permissive attribute fallback.

		Args:
			name: Dotted module name stored on the stub.
			pkg: Value assigned to the stub's ``__package__``.
			**attrs: Attributes to set explicitly on the stub.

		Returns:
			A ``types.ModuleType``. Attributes not set explicitly resolve to a
			callable that accepts any arguments and returns ``None``; dunder
			lookups raise ``AttributeError`` as usual.
		"""
		m = types.ModuleType(name)
		m.__file__ = None
		m.__spec__ = None
		m.__loader__ = None
		m.__package__ = pkg
		for k, v in attrs.items():
			setattr(m, k, v)

		def __getattr__(attr):
			if attr.startswith("__") and attr.endswith("__"):
				raise AttributeError(attr)
			# The fallback must tolerate any call signature, including keywords.
			return lambda *a, **kw: None

		m.__getattr__ = __getattr__
		return m

	# Pre-register a stub for the XTTS stream_generator module so importing it
	# yields the stub; only init_stream_support is provided explicitly (as a no-op).
	sys.modules["TTS.tts.layers.xtts.stream_generator"] = _make_stub_module(
		name="TTS.tts.layers.xtts.stream_generator",
		pkg="TTS.tts.layers.xtts",
		init_stream_support=lambda: None,
	)
	# ─────────────────────────────────────────────────────────────────────────────

	import warnings
	import sys

	# Silence the transformers forward-compatibility warnings (GPT2InferenceModel/GenerationMixin).
	# filterwarnings() matches the regex against the START of the warning text, so the
	# patterns need a leading ".*" to hit a name that appears mid-message.
	warnings.filterwarnings("ignore", message=".*GenerationMixin.*")
	warnings.filterwarnings("ignore", message=".*prepare_inputs_for_generation.*")

	# "Invalid file descriptor: -1" vem do asyncio __del__ durante GC — bypassa o
	# sistema de warnings e vai para sys.unraisablehook. Filtramos especificamente.
	_orig_unraisablehook = sys.unraisablehook


	def _quiet_unraisablehook(unraisable):
		"""Drop "Invalid file descriptor" ValueErrors; forward everything else.

		Installed as ``sys.unraisablehook``. Any unraisable exception that is not
		a ``ValueError`` mentioning "Invalid file descriptor" is handed to the
		hook that was active before this one was installed.
		"""
		error = unraisable.exc_value
		is_fd_noise = isinstance(error, ValueError) and "Invalid file descriptor" in str(error)
		if not is_fd_noise:
			_orig_unraisablehook(unraisable)


	sys.unraisablehook = _quiet_unraisablehook

	import gradio as gr
	import spaces
	import traceback
	import uuid
	import io
	import os
	import json
	import shutil
	import tempfile
	from pathlib import Path
	from typing import Optional

	from huggingface_hub import snapshot_download
	from TTS.tts.configs.xtts_config import XttsConfig
	from TTS.tts.models.xtts import Xtts
	from pydub import AudioSegment
	import soundfile as sf
	import numpy as np
	from fastapi.responses import StreamingResponse, JSONResponse
	from pydantic import BaseModel

	# ── Configuração ──────────────────────────────────────────────────────────────
	# Hub repository holding the XTTS-v2 files and the local directory they are synced to.
	MODEL_HF_REPO = "tts-hub/XTTS-v2"
	MODEL_CACHE_DIR = Path("/tmp/xtts_v2")

	# Saved reference voices live here (created on import if missing), next to config.json.
	VOICES_DIR = Path("voices")
	VOICES_DIR.mkdir(exist_ok=True)
	CONFIG_FILE = VOICES_DIR / "config.json"

	# File extensions accepted for uploaded reference audio.
	SUPPORTED_FORMATS = [
		".wav",
		".mp3",
		".ogg",
		".flac",
		".m4a",
		".aac",
		".opus",
		".wma",
		".webm",
	]

	# (display name, language code) pairs offered in the language dropdown.
	LANGUAGES = [
		("Português (BR)", "pt"),
		("English", "en"),
		("Español", "es"),
		("Français", "fr"),
		("Deutsch", "de"),
		("Italiano", "it"),
		("Polski", "pl"),
		("Türkçe", "tr"),
		("Русский", "ru"),
		("Nederlands", "nl"),
		("Čeština", "cs"),
		("العربية", "ar"),
		("中文", "zh-cn"),
		("Magyar", "hu"),
		("한국어", "ko"),
		("日本語", "ja"),
	]
	# Dropdown label -> language code; the labels (in order) are the dropdown choices.
	LANG_MAP = {f"{name} ({code})": code for name, code in LANGUAGES}
	LANG_CHOICES = list(LANG_MAP)


	def _load_config() -> dict:
		"""Read the voice configuration stored in ``CONFIG_FILE``.

		Returns:
			The parsed JSON object, or ``{"active_voice": None}`` when the file is
			missing, unreadable, not valid JSON, or does not contain a JSON object.
		"""
		try:
			cfg = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
		except (OSError, ValueError):
			# OSError: file missing or unreadable. ValueError covers both
			# json.JSONDecodeError and UnicodeDecodeError (corrupt content).
			cfg = None
		# Callers use .get() and item assignment, so only a dict is usable.
		if not isinstance(cfg, dict):
			return {"active_voice": None}
		return cfg


	def _save_config(cfg: dict):
		"""Overwrite ``CONFIG_FILE`` with ``cfg`` serialised as indented JSON."""
		serialized = json.dumps(cfg, indent=2)
		CONFIG_FILE.write_text(serialized)


	# ── Load the XTTS-v2 model from the Hugging Face Hub ──────────────────────────
	# Fetching from the Hub avoids the defunct Coqui servers. snapshot_download stores
	# the files under MODEL_CACHE_DIR (per the original author it is a no-op when the
	# files are already present). Everything here runs on CPU at startup — the GPU is
	# only used inside @spaces.GPU functions.
	print("Baixando/verificando modelo XTTS-v2 do HF Hub...")
	# Optional access token, taken from either of the two environment variables.
	_hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
	_model_dir = snapshot_download(
	repo_id=MODEL_HF_REPO,
	local_dir=str(MODEL_CACHE_DIR),
	token=_hf_token,
	)

	# Model configuration is read from the config.json inside the downloaded directory.
	_xtts_config = XttsConfig()
	_xtts_config.load_json(str(MODEL_CACHE_DIR / "config.json"))

	# PyTorch 2.6+ made weights_only=True the default for torch.load, which breaks TTS
	# checkpoints that serialise arbitrary Python objects. torch.load is wrapped so that
	# weights_only defaults to False when the caller does not specify it.
	# NOTE(security): weights_only=False lets the checkpoint's pickle payload execute
	# arbitrary code. That is only acceptable while every checkpoint loaded by this
	# process comes from a trusted source (here: the Hub repository configured above).
	import torch as _torch
	_orig_torch_load = _torch.load


	def _torch_load_compat(*args, **kwargs):
		"""Call the original ``torch.load`` with ``weights_only=False`` as default.

		All positional and keyword arguments are forwarded unchanged; an explicit
		``weights_only`` supplied by the caller is respected.
		"""
		kwargs.setdefault("weights_only", False)
		return _orig_torch_load(*args, **kwargs)


	_torch.load = _torch_load_compat

	# Build the model from the loaded config, read the checkpoint from the cache
	# directory (eval=True is passed to load_checkpoint) and keep it on the CPU.
	model = Xtts.init_from_config(_xtts_config)
	model.load_checkpoint(_xtts_config, checkpoint_dir=str(MODEL_CACHE_DIR), eval=True)
	model.cpu()
	print("Modelo XTTS-v2 carregado na CPU.")

	# With transformers>=4.50, PreTrainedModel no longer inherits from GenerationMixin,
	# so GPT2InferenceModel (used inside XTTS) loses .generate() and raises
	# AttributeError. GenerationMixin is therefore prepended to its base classes.
	from transformers import GenerationMixin as _GenMixin, GenerationConfig as _GenConfig
	from TTS.tts.layers.xtts.gpt import GPT2InferenceModel as _GPT2Inf, GPT as _GPT
	_needs_generation_mixin = not issubclass(_GPT2Inf, _GenMixin)
	if _needs_generation_mixin:
		# The mixin goes first, followed by the existing bases in their original order.
		_GPT2Inf.__bases__ = (_GenMixin, *_GPT2Inf.__bases__)
		print("Patch GenerationMixin aplicado ao GPT2InferenceModel.")

	# gpt.py passes eos/bos/pad_token_id as direct kwargs to gpt_inference.generate().
	# Per the original author, in transformers>=4.45 those kwargs are deprecated and
	# IGNORED, so generation runs until max_length and produces noise. The fix has two parts:
	#
	# 1. Put the token IDs into the generation_config of gpt_inference (done here).
	# 2. Patch GPT.generate to drop those kwargs before forwarding them (done below).
	_stop = int(model.gpt.stop_audio_token)
	_start = int(model.gpt.start_audio_token)
	# The stop token doubles as the padding token.
	model.gpt.gpt_inference.generation_config = _GenConfig(
	eos_token_id=_stop,
	bos_token_id=_start,
	pad_token_id=_stop,
	)
	print(f"generation_config injetado: eos={_stop} bos={_start}.")

	_orig_gpt_cls_generate = _GPT.generate


	def _patched_gpt_cls_generate(self, cond_latents, text_inputs, **hf_generate_kwargs):
		"""Forward to the original ``GPT.generate`` without the token-id kwargs.

		``eos_token_id``, ``bos_token_id`` and ``pad_token_id`` are filtered out of
		``hf_generate_kwargs``; every other argument is passed through unchanged.
		"""
		forwarded = {
			key: value
			for key, value in hf_generate_kwargs.items()
			if key not in ("eos_token_id", "bos_token_id", "pad_token_id")
		}
		return _orig_gpt_cls_generate(self, cond_latents, text_inputs, **forwarded)


	_GPT.generate = _patched_gpt_cls_generate
	print("Patch GPT.generate aplicado: kwargs de token removidos (vivem em generation_config).")

	# Patch load_audio do XTTS para usar soundfile em vez de torchaudio.
	# torchaudio 2.9+ usa torchcodec como backend padrão (não instalado).
	# Nossos áudios de referência já são WAV 22050Hz mono (via convert_to_wav).
	import torch as _torch2
	import soundfile as _sf
	import numpy as _np

	def _load_audio_soundfile(audiopath, sampling_rate=22050):
		"""Load an audio file with soundfile and return it as a float tensor.

		Args:
			audiopath: Path of the audio file (converted with ``str()``).
			sampling_rate: Rate the returned samples should have.

		Returns:
			A ``FloatTensor`` with one extra leading dimension (``unsqueeze(0)``).
			Only the tensor is returned, not a tuple.
		"""
		samples, source_rate = _sf.read(str(audiopath), dtype="float32")
		# More than one dimension: average over axis 1 to collapse to mono
		# (assumes soundfile returns (frames, channels) — TODO confirm).
		if samples.ndim > 1:
			samples = samples.mean(axis=1)
		# Resample only when the file's rate differs from the requested one.
		if source_rate != sampling_rate:
			from scipy import signal as _sig
			target_len = int(len(samples) * sampling_rate / source_rate)
			samples = _sig.resample(samples, target_len)
		return _torch2.FloatTensor(samples).unsqueeze(0)


	# Replace the load_audio name inside the XTTS model module with the soundfile version.
	import TTS.tts.models.xtts as _xtts_mod
	_xtts_mod.load_audio = _load_audio_soundfile


	# ── Conversão de áudio ────────────────────────────────────────────────────────
	def convert_to_wav(input_path: str) -> str:
		"""Convert an audio file to a 22050 Hz mono WAV stored in a temporary file.

		Args:
			input_path: Path of the source audio file. Its extension selects the
				decoder format; unknown extensions are treated as WAV.

		Returns:
			Path of the newly created ``.wav`` file. The caller owns the file and
			is responsible for deleting or moving it.

		Raises:
			Exception: whatever pydub raises while decoding or exporting; the
				temporary output file is removed before the error propagates.
		"""
		src = Path(input_path)
		fmt_map = {
			".mp3": "mp3", ".ogg": "ogg", ".flac": "flac",
			".m4a": "m4a", ".aac": "aac", ".opus": "ogg",
			".wma": "asf", ".webm": "webm", ".wav": "wav",
		}
		fmt = fmt_map.get(src.suffix.lower(), "wav")

		audio = AudioSegment.from_file(str(src), format=fmt)
		audio = audio.set_channels(1).set_frame_rate(22050)

		# Reserve a unique path and close the handle immediately, so no open
		# descriptor is left behind while pydub writes to the path.
		with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
			out_path = tmp.name
		try:
			# export() hands back the output file object; close it explicitly.
			exported = audio.export(out_path, format="wav")
			exported.close()
		except Exception:
			# Do not leave an orphaned temp file behind when the export fails.
			try:
				os.unlink(out_path)
			except OSError:
				pass
			raise
		return out_path


	# ── Núcleo de geração (GPU exclusiva via ZeroGPU) ─────────────────────────────
	@spaces.GPU
	def synthesize(text: str, speaker_wav: str, language: str = "pt") -> bytes:
		"""Synthesise ``text`` in the voice of ``speaker_wav`` and return MP3 bytes.

		The global model is moved to the GPU for the duration of the call and is
		always moved back to the CPU afterwards, whether or not synthesis succeeds.

		Args:
			text: Text to speak.
			speaker_wav: Path of the reference audio used for voice conditioning.
			language: Language code passed to the model.

		Returns:
			The encoded MP3 (128k bitrate) as ``bytes``.

		Raises:
			Exception: any error from the model or the encoders; the traceback is
				printed before it is re-raised.
		"""
		import traceback as _tb
		try:
			# Moved inside the try so the finally clause also runs (and the error
			# is logged) when the transfer to the GPU itself fails.
			model.cuda()
			gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
				audio_path=[speaker_wav],
				gpt_cond_len=30,
				max_ref_length=60,
			)
			out = model.inference(
				text=text,
				language=language,
				gpt_cond_latent=gpt_cond_latent,
				speaker_embedding=speaker_embedding,
				temperature=0.7,
			)
			wav = np.array(out["wav"], dtype=np.float32)
			# Peak-normalise, then convert to 16-bit PCM. An empty or all-zero
			# waveform is left as is (max() of an empty array would raise).
			max_val = np.abs(wav).max() if wav.size else 0.0
			if max_val > 0:
				wav = wav / max_val
			wav_int16 = (wav * 32767).astype(np.int16)
			# Intermediate in-memory WAV -> MP3 via pydub.
			# NOTE(review): 24000 is presumably the model's output sample rate — confirm.
			wav_buf = io.BytesIO()
			sf.write(wav_buf, wav_int16, 24000, format="WAV", subtype="PCM_16")
			wav_buf.seek(0)
			segment = AudioSegment.from_wav(wav_buf)
			mp3_buf = io.BytesIO()
			segment.export(mp3_buf, format="mp3", bitrate="128k")
			return mp3_buf.getvalue()
		except Exception:
			print("=== SYNTHESIZE ERROR ===", flush=True)
			print(_tb.format_exc(), flush=True)
			raise
		finally:
			model.cpu()


	# ── Helpers de voz ────────────────────────────────────────────────────────────
	def list_voices() -> list[str]:
		"""Return the sorted names (file stems) of every ``*.wav`` file in ``VOICES_DIR``."""
		stems = [wav_file.stem for wav_file in VOICES_DIR.glob("*.wav")]
		stems.sort()
		return stems


	def get_voice_path(voice_name: str) -> Optional[str]:
		"""Resolve a saved voice name to the path of its WAV file.

		Args:
			voice_name: Name of the voice (file stem inside ``VOICES_DIR``).

		Returns:
			The path as a string, or ``None`` when the name is empty, is not a
			plain file name, or no such file exists.
		"""
		# Voice names can come straight from HTTP requests. Accept only a single
		# path component so values like "../../x" cannot address files outside
		# VOICES_DIR.
		if not voice_name or Path(voice_name).name != voice_name or "\\" in voice_name:
			return None
		p = VOICES_DIR / f"{voice_name}.wav"
		return str(p) if p.exists() else None


	def get_active_voice() -> Optional[str]:
		"""Return the configured active voice, or ``None`` if unset or its file is gone."""
		candidate = _load_config().get("active_voice")
		if not candidate:
			return None
		# The configured name only counts while its WAV file still exists.
		if not (VOICES_DIR / f"{candidate}.wav").exists():
			return None
		return candidate


	def set_active_voice(voice_name: str) -> str:
		"""Store ``voice_name`` as the active voice in the config file and return it."""
		updated = {**_load_config(), "active_voice": voice_name}
		_save_config(updated)
		return voice_name


	# ── Funções da UI ─────────────────────────────────────────────────────────────
	def ui_generate(text, voice_sel, ref_audio, lang_sel):
		"""Gradio handler for the "Sintetizar" button.

		The reference voice is chosen in this order: uploaded/recorded
		``ref_audio``, then the selected saved voice, then the active voice.

		Returns:
			``(audio_bytes, status_message)``; on any failure the first element is
			``None`` and the message explains why (a traceback for unexpected errors).
		"""
		try:
			if not (text and text.strip()):
				return None, "Texto vazio."
			language = LANG_MAP.get(lang_sel, "pt")

			# Only an ad-hoc reference creates a temporary WAV that must be removed.
			cleanup = False
			if ref_audio:
				wav_path = convert_to_wav(ref_audio)
				cleanup = True
				label = "referência enviada"
			elif voice_sel:
				wav_path, label = get_voice_path(voice_sel), voice_sel
			else:
				active = get_active_voice()
				if not active:
					return None, "Nenhuma voz disponível. Salve uma voz na aba Vozes ou envie uma referência."
				wav_path, label = get_voice_path(active), f"{active} (ativa)"

			if not wav_path:
				return None, "Arquivo de voz não encontrado."

			try:
				audio_bytes = synthesize(text.strip(), wav_path, language)
			finally:
				if cleanup:
					os.unlink(wav_path)

			return audio_bytes, f"Gerado com '{label}' — {len(audio_bytes) // 1024} KB"
		except Exception:
			return None, traceback.format_exc()


	def ui_save_voice(name, audio_file):
		"""Gradio handler that stores an uploaded audio file as a named voice.

		The name is normalised (trimmed, lower-cased, spaces to underscores), the
		audio is converted to WAV and moved to ``VOICES_DIR/<name>.wav``. If no
		voice is active yet, the new one becomes active.

		Returns:
			A 4-tuple: status message plus updates for the manage dropdown, the
			generate dropdown and the active-voice dropdown. On validation errors
			or exceptions the three updates are no-ops.
		"""
		try:
			if not name or not name.strip():
				return "Nome da voz é obrigatório.", gr.update(), gr.update(), gr.update()
			if not audio_file:
				return "Envie um arquivo de áudio.", gr.update(), gr.update(), gr.update()

			name = name.strip().lower().replace(" ", "_")
			# The name becomes a file name inside VOICES_DIR: refuse anything that
			# is not a single plain path component, so the file cannot be written
			# elsewhere.
			if Path(name).name != name or "\\" in name or name in (".", ".."):
				return "Nome de voz inválido (não use '/', '\\' nem '..').", gr.update(), gr.update(), gr.update()

			src = Path(audio_file)
			if src.suffix.lower() not in SUPPORTED_FORMATS:
				return f"Formato '{src.suffix}' não suportado. Use: {', '.join(SUPPORTED_FORMATS)}", gr.update(), gr.update(), gr.update()

			wav_tmp = convert_to_wav(audio_file)
			dest = VOICES_DIR / f"{name}.wav"
			shutil.move(wav_tmp, dest)

			voices = list_voices()
			active = get_active_voice()
			msg = f"Voz '{name}' salva."
			if not active:
				# First usable voice: make it the default.
				set_active_voice(name)
				active = name
				msg += " Definida como voz ativa."

			return (
				msg,
				gr.update(choices=voices, value=name),  # manage dropdown
				gr.update(choices=voices, value=name),  # generate dropdown
				gr.update(choices=voices, value=active),  # active-voice dropdown
			)
		except Exception:
			return traceback.format_exc(), gr.update(), gr.update(), gr.update()


	def ui_delete_voice(voice_name):
		"""Gradio handler that deletes a saved voice.

		Removes ``VOICES_DIR/<voice_name>.wav`` if present. When the deleted voice
		was the active one, the first remaining voice (or ``None``) becomes active.

		Returns:
			A 4-tuple: status message plus updates for the manage dropdown, the
			generate dropdown and the active-voice dropdown.
		"""
		try:
			if not voice_name:
				return "Selecione uma voz.", gr.update(), gr.update(), gr.update()

			target = VOICES_DIR / f"{voice_name}.wav"
			if target.exists():
				target.unlink()

			cfg = _load_config()
			if cfg.get("active_voice") == voice_name:
				remaining = list_voices()
				cfg["active_voice"] = remaining[0] if remaining else None
				_save_config(cfg)

			voices = list_voices()
			active = get_active_voice()
			first_voice = voices[0] if voices else None
			status = f"Voz '{voice_name}' excluída."
			return (
				status,
				gr.update(choices=voices, value=first_voice),
				gr.update(choices=voices, value=first_voice),
				gr.update(choices=voices, value=active),
			)
		except Exception:
			return traceback.format_exc(), gr.update(), gr.update(), gr.update()


	def ui_set_active(voice_name):
		"""Gradio handler: persist ``voice_name`` as the active voice.

		Returns a status message and an update for the active-voice dropdown
		(a no-op update when nothing was selected).
		"""
		if voice_name:
			set_active_voice(voice_name)
			return f"Voz ativa: '{voice_name}'", gr.update(value=voice_name)
		return "Selecione uma voz.", gr.update()


	def ui_preview_voice(voice_name):
		"""Return the file path of the selected voice for the preview player, or ``None``."""
		if voice_name:
			# A falsy lookup result is normalised to None.
			return get_voice_path(voice_name) or None
		return None


	def ui_refresh_all():
		"""Re-read the saved voices and return updates for the three voice dropdowns.

		The manage and generate dropdowns select the first voice (or nothing);
		the active-voice dropdown selects the currently active voice.
		"""
		voices = list_voices()
		active = get_active_voice()
		first_voice = next(iter(voices), None)
		return (
			gr.update(choices=voices, value=first_voice),
			gr.update(choices=voices, value=first_voice),
			gr.update(choices=voices, value=active),
		)


	# ── Interface Gradio ──────────────────────────────────────────────────────────
	_voices_init = list_voices()
	_active_init = get_active_voice()


	def _active_badge_text(active):
		"""Format the text of the read-only "active voice" badge."""
		return f"Ativa: {active or '—'}"


	with gr.Blocks(title="AVA TTS") as demo:
		gr.Markdown("# AVA TTS\nSíntese de voz com clone via XTTS-v2 • API em `/v1/audio/speech`")

		# State component kept from the original layout (not wired to any event here).
		voice_gen_dd = gr.State(None)

		# ── Tab: generate ─────────────────────────────────────────────────────────
		with gr.Tab("Gerar"):
			with gr.Row():
				with gr.Column(scale=2):
					txt_input = gr.Textbox(
						label="Texto",
						lines=5,
						placeholder="Digite o texto a sintetizar...",
					)
					lang_dd = gr.Dropdown(
						choices=LANG_CHOICES,
						value="Português (BR) (pt)",
						label="Idioma",
					)
					with gr.Row():
						voice_dd_gen = gr.Dropdown(
							choices=_voices_init,
							value=_active_init,
							label="Voz (deixe vazio para usar a voz ativa)",
						)
						active_badge = gr.Textbox(
							value=_active_badge_text(_active_init),
							label="",
							interactive=False,
							scale=1,
						)
					ref_audio_gen = gr.Audio(
						label="Referência ad-hoc (sobrepõe voz selecionada)",
						type="filepath",
						sources=["upload", "microphone"],
					)
					gen_btn = gr.Button("Sintetizar", variant="primary")

				with gr.Column(scale=1):
					audio_out = gr.Audio(label="Resultado", type="numpy")
					gen_status = gr.Textbox(label="Status", interactive=False, lines=3)

			gen_btn.click(
				ui_generate,
				[txt_input, voice_dd_gen, ref_audio_gen, lang_dd],
				[audio_out, gen_status],
			)

		# ── Tab: voices ───────────────────────────────────────────────────────────
		with gr.Tab("Vozes"):
			gr.Markdown(
				"Salve amostras de voz (3–10s de áudio limpo). "
				f"Formatos aceitos: {', '.join(SUPPORTED_FORMATS)} — convertidos automaticamente para WAV."
			)

			with gr.Row():
				# Column: add a voice
				with gr.Column():
					gr.Markdown("### Adicionar voz")
					new_name = gr.Textbox(label="Nome", placeholder="ex: ava, narrador, lucas")
					new_audio = gr.File(
						label="Arquivo de áudio",
						file_types=SUPPORTED_FORMATS,
					)
					save_btn = gr.Button("Salvar voz", variant="primary")
					save_status = gr.Textbox(label="Status", interactive=False, lines=2)

				# Column: manage existing voices
				with gr.Column():
					gr.Markdown("### Gerenciar vozes")

					with gr.Row():
						active_dd = gr.Dropdown(
							choices=_voices_init,
							value=_active_init,
							label="Voz ativa (usada como padrão na API)",
						)
						set_active_btn = gr.Button("Definir ativa", variant="secondary")

					voice_mgr_dd = gr.Dropdown(
						choices=_voices_init,
						value=_voices_init[0] if _voices_init else None,
						label="Selecionar voz",
					)
					preview_audio = gr.Audio(label="Preview da referência", type="filepath", interactive=False)
					with gr.Row():
						refresh_btn = gr.Button("Atualizar lista")
						del_btn = gr.Button("Excluir selecionada", variant="stop")
					mgr_status = gr.Textbox(label="Status", interactive=False, lines=2)

			# Events — voices tab. Every action that can change the active voice is
			# followed by a step that refreshes the badge on the generate tab.
			voice_mgr_dd.change(ui_preview_voice, [voice_mgr_dd], [preview_audio])

			set_active_btn.click(
				ui_set_active,
				[active_dd],
				[mgr_status, active_dd],
			).then(
				_active_badge_text,
				[active_dd],
				[active_badge],
			)

			save_btn.click(
				ui_save_voice,
				[new_name, new_audio],
				[save_status, voice_mgr_dd, voice_dd_gen, active_dd],
			).then(
				_active_badge_text,
				[active_dd],
				[active_badge],
			)

			del_btn.click(
				ui_delete_voice,
				[voice_mgr_dd],
				[mgr_status, voice_mgr_dd, voice_dd_gen, active_dd],
			).then(
				_active_badge_text,
				[active_dd],
				[active_badge],
			)

			refresh_btn.click(
				ui_refresh_all,
				[],
				[voice_mgr_dd, voice_dd_gen, active_dd],
			).then(
				_active_badge_text,
				[active_dd],
				[active_badge],
			)

		# ── Tab: API ──────────────────────────────────────────────────────────────
		# The endpoint answers with MP3 (audio/mpeg), so the example saves to .mp3.
		# Table rows use plain "|" — an escaped pipe is not a column separator.
		with gr.Tab("API"):
			gr.Markdown("""
	## POST `/v1/audio/speech`

	```bash
	curl -X POST https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/audio/speech \\
	-H "Authorization: Bearer $HF_TOKEN" \\
	-H "Content-Type: application/json" \\
	-d '{"model":"xtts-v2","input":"Olá! Eu sou a AVA.","voice":"ava","language":"pt"}' \\
	--output audio.mp3
	```

	| Campo | Tipo | Descrição |
	|---|---|---|
	| `input` | string | Texto a sintetizar |
	| `voice` | string | Nome de voz salva (padrão: voz ativa) |
	| `language` | string | Código do idioma (padrão: `pt`) |
	| `speaker_wav_b64` | string | Áudio WAV em base64 (alternativa ao `voice`) |

	## GET `/v1/voices`

	Retorna lista de vozes salvas e qual está ativa.

	```bash
	curl https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/voices \\
	-H "Authorization: Bearer $HF_TOKEN"
	```
	""")


	# ── OpenAI-compatible API ─────────────────────────────────────────────────────
	# Request body accepted by POST /v1/audio/speech. Only `input` is required;
	# every other field has a default.
	class SpeechRequest(BaseModel):
	model: Optional[str] = "xtts-v2"
	# Text to synthesise.
	input: str
	voice: Optional[str] = None # None → use the active voice
	language: Optional[str] = "pt"
	# Base64-encoded reference audio; when present it is used instead of `voice`.
	speaker_wav_b64: Optional[str] = None


	def api_list_voices():
		"""Handler for GET /v1/voices: the saved voice names and the active voice."""
		payload = {"voices": list_voices()}
		payload["active"] = get_active_voice()
		return payload


	def api_speech(req: SpeechRequest):
		"""Handler for POST /v1/audio/speech.

		The reference voice is ``req.speaker_wav_b64`` when given (decoded into a
		temporary file that is removed afterwards), otherwise ``req.voice``,
		otherwise the active voice.

		Returns:
			A ``StreamingResponse`` with the MP3 audio (``audio/mpeg``) on success,
			or a ``JSONResponse`` with an ``error`` field: 400 for empty input or
			malformed base64, 404 when no usable voice is found, 500 for
			unexpected failures.
		"""
		try:
			if not req.input or not req.input.strip():
				return JSONResponse(status_code=400, content={"error": "input vazio"})

			cleanup = False
			if req.speaker_wav_b64:
				import base64
				import binascii
				try:
					raw = base64.b64decode(req.speaker_wav_b64)
				except (binascii.Error, ValueError):
					# Malformed client input is a client error, not a server fault.
					return JSONResponse(status_code=400, content={"error": "speaker_wav_b64 não é um base64 válido"})
				with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
					tmp.write(raw)
					speaker_wav = tmp.name
				cleanup = True
			else:
				voice_name = req.voice or get_active_voice()
				if not voice_name:
					return JSONResponse(status_code=404, content={"error": "Nenhuma voz ativa. Defina uma voz na UI ou passe 'voice' no request."})
				speaker_wav = get_voice_path(voice_name)
				if not speaker_wav:
					return JSONResponse(status_code=404, content={"error": f"Voz '{voice_name}' não encontrada. Use GET /v1/voices."})

			try:
				audio_bytes = synthesize(req.input.strip(), speaker_wav, req.language or "pt")
			finally:
				if cleanup:
					os.unlink(speaker_wav)

			return StreamingResponse(
				io.BytesIO(audio_bytes),
				media_type="audio/mpeg",
				headers={"Content-Disposition": f'attachment; filename="speech-{uuid.uuid4().hex[:8]}.mp3"'},
			)
		except Exception:
			# Keep the traceback in the server log; HTTP clients only get a generic
			# message so internal paths and code details are not exposed.
			print("=== API SPEECH ERROR ===", flush=True)
			print(traceback.format_exc(), flush=True)
			return JSONResponse(status_code=500, content={"error": "Erro interno ao gerar o áudio."})


	# Start the Gradio server. launch() is called with prevent_thread_lock=True and its
	# first return value is kept as `app`, so the extra API routes can be registered
	# on it below.
	app, _, _ = demo.launch(
	server_name="0.0.0.0",
	server_port=7860,
	prevent_thread_lock=True,
	ssr_mode=False,
	theme=gr.themes.Soft(),
	)

	# Extra HTTP endpoints served next to the UI.
	app.add_api_route("/v1/voices", api_list_voices, methods=["GET"])
	app.add_api_route("/v1/audio/speech", api_speech, methods=["POST"])

	# Presumably keeps the main thread alive now that launch() no longer blocks — confirm against Gradio docs.
	demo.block_thread()