Spaces:
Paused
Paused
| import sys | |
| import types | |
# -- ZeroGPU startup patch ------------------------------------------------------
# A torchaudio build compiled against CUDA 13 hangs on import when no GPU is
# available. A stub for torchaudio._extension is therefore injected into
# sys.modules BEFORE any TTS/torchaudio import. Python then uses the stub
# without running the real __init__.py, which skips all of the CUDA extension
# loading logic.
# Audio loading automatically falls back to the soundfile backend (no CUDA).
def _make_torchaudio_ext_stub(name):
    """Build a placeholder module that stands in for ``torchaudio._extension``.

    The returned module is meant to be placed in ``sys.modules`` before
    torchaudio is imported, so the real extension package is never executed.

    Behaviour of the stub:
      * ``__file__``, ``__spec__`` and ``__loader__`` are explicitly ``None``
        and ``__package__`` is the parent package of ``name``, so these
        lookups never reach the fallback ``__getattr__``.
      * The known availability flags are ``False`` and the known loader hooks
        are no-op callables.
      * Any other missing dunder raises ``AttributeError``.
      * Any other missing name starting with ``_IS_``, ``_HAS_``, ``is_`` or
        ``has_`` evaluates to ``False``.
      * Every remaining missing name resolves to a callable that accepts any
        arguments and returns ``None``.

    Args:
        name: Dotted module name the stub should report as ``__name__``.

    Returns:
        The configured ``types.ModuleType`` instance.
    """
    stub = types.ModuleType(name)

    predefined = {
        # Explicit module dunders so they do not fall through to __getattr__.
        "__file__": None,
        "__spec__": None,
        "__loader__": None,
        # rsplit leaves an undotted name unchanged, so it is its own package.
        "__package__": name.rsplit(".", 1)[0],
        # Known attributes of the real extension module.
        "_IS_TORCHAUDIO_EXT_AVAILABLE": False,
        "_IS_SOX_AVAILABLE": False,
        "_IS_FFMPEG_AVAILABLE": False,
        "_load_lib": lambda *a, **kw: False,
        "_check_cuda_version": lambda: None,
        "_init_ffmpeg": lambda: None,
    }
    for attr_name, attr_value in predefined.items():
        setattr(stub, attr_name, attr_value)

    flag_prefixes = ("_IS_", "_HAS_", "is_", "has_")

    def __getattr__(attr):
        # Never answer for dunders: let Python see a regular AttributeError.
        is_dunder = attr.startswith("__") and attr.endswith("__")
        if is_dunder:
            raise AttributeError(f"module {name!r} has no attribute {attr!r}")
        # Boolean-looking flags report "not available".
        if attr.startswith(flag_prefixes):
            return False
        # Everything else becomes a no-op callable.
        return lambda *a, **kw: None

    stub.__getattr__ = __getattr__
    return stub
# Register one stub per module path so that later imports of either name are
# served from sys.modules.
sys.modules.update(
    {
        stub_name: _make_torchaudio_ext_stub(stub_name)
        for stub_name in ("torchaudio._extension", "torchaudio._extension.utils")
    }
)
# Coqui TTS 0.22's xtts.py imports stream_generator, which in turn imports
# BeamSearchScorer -- removed entirely from transformers>=4.50 (it no longer
# exists even in the internal module). Regular (non-streaming) inference only
# needs init_stream_support as a no-op, so a complete stub of the module is
# injected instead.
def _make_stub_module(name, pkg, **attrs):
    """Create a permissive placeholder module.

    Args:
        name: Dotted name reported as the module's ``__name__``.
        pkg: Value stored in ``__package__``.
        **attrs: Extra attributes to set on the module verbatim.

    Returns:
        A ``types.ModuleType`` whose ``__file__``, ``__spec__`` and
        ``__loader__`` are ``None``. Missing dunder attributes raise
        ``AttributeError``; any other missing attribute resolves to a callable
        that accepts anything and returns ``None``.
    """
    stub = types.ModuleType(name)
    stub.__file__ = stub.__spec__ = stub.__loader__ = None
    stub.__package__ = pkg

    for attr_name in attrs:
        setattr(stub, attr_name, attrs[attr_name])

    def __getattr__(attr):
        if not (attr.startswith("__") and attr.endswith("__")):
            return lambda *a, **kw: None
        raise AttributeError(attr)

    # Installed last, so it overrides any "__getattr__" passed through attrs.
    stub.__getattr__ = __getattr__
    return stub
# Pre-register the stream_generator stub; init_stream_support is the only
# attribute set explicitly, everything else is answered by the stub fallback.
sys.modules["TTS.tts.layers.xtts.stream_generator"] = _make_stub_module(
    name="TTS.tts.layers.xtts.stream_generator",
    pkg="TTS.tts.layers.xtts",
    init_stream_support=lambda: None,
)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| import warnings | |
| import sys | |
# Silence the transformers forward-compatibility warnings
# (GPT2InferenceModel / GenerationMixin).
warnings.filterwarnings(action="ignore", message=".*GenerationMixin.*")
warnings.filterwarnings(action="ignore", message=".*prepare_inputs_for_generation.*")

# "Invalid file descriptor: -1" is raised from an asyncio __del__ during GC; it
# bypasses the warnings machinery and reaches sys.unraisablehook, so it is
# filtered there specifically.
_orig_unraisablehook = sys.unraisablehook


def _quiet_unraisablehook(unraisable):
    """Drop the "Invalid file descriptor" ValueError; forward everything else.

    Args:
        unraisable: The object Python passes to ``sys.unraisablehook``; only
            its ``exc_value`` attribute is inspected here.
    """
    error = unraisable.exc_value
    is_noise = isinstance(error, ValueError) and "Invalid file descriptor" in str(error)
    if not is_noise:
        _orig_unraisablehook(unraisable)


sys.unraisablehook = _quiet_unraisablehook
| import gradio as gr | |
| import spaces | |
| import traceback | |
| import uuid | |
| import io | |
| import os | |
| import json | |
| import shutil | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Optional | |
| from huggingface_hub import snapshot_download | |
| from TTS.tts.configs.xtts_config import XttsConfig | |
| from TTS.tts.models.xtts import Xtts | |
| from pydub import AudioSegment | |
| import soundfile as sf | |
| import numpy as np | |
| from fastapi.responses import StreamingResponse, JSONResponse | |
| from pydantic import BaseModel | |
# -- Configuration -------------------------------------------------------------
MODEL_HF_REPO = "tts-hub/XTTS-v2"
MODEL_CACHE_DIR = Path("/tmp") / "xtts_v2"

# Saved voice references live here; the directory is created on import.
VOICES_DIR = Path("voices")
VOICES_DIR.mkdir(exist_ok=True)
CONFIG_FILE = VOICES_DIR.joinpath("config.json")

# File extensions accepted for uploaded voice samples.
SUPPORTED_FORMATS = [
    ".wav",
    ".mp3",
    ".ogg",
    ".flac",
    ".m4a",
    ".aac",
    ".opus",
    ".wma",
    ".webm",
]
# (display name, language code) pairs offered in the UI.
LANGUAGES = [
    ("PortuguΓͺs (BR)", "pt"),
    ("English", "en"),
    ("EspaΓ±ol", "es"),
    ("FranΓ§ais", "fr"),
    ("Deutsch", "de"),
    ("Italiano", "it"),
    ("Polski", "pl"),
    ("TΓΌrkΓ§e", "tr"),
    ("Π ΡΡΡΠΊΠΈΠΉ", "ru"),
    ("Nederlands", "nl"),
    ("ΔeΕ‘tina", "cs"),
    ("Ψ§ΩΨΉΨ±Ψ¨ΩΨ©", "ar"),
    ("δΈζ", "zh-cn"),
    ("Magyar", "hu"),
    ("νκ΅μ΄", "ko"),
    ("ζ₯ζ¬θͺ", "ja"),
]

# Dropdown label -> language code. Labels are unique, so the dict keys (which
# keep insertion order) double as the ordered list of dropdown choices.
LANG_MAP = {f"{name} ({code})": code for name, code in LANGUAGES}
LANG_CHOICES = list(LANG_MAP)
def _load_config() -> dict:
    """Load the voice configuration stored in ``CONFIG_FILE``.

    Returns:
        The parsed JSON object when the file exists and contains a JSON
        object. Otherwise ``{"active_voice": None}``: when the file is
        missing, unreadable, not valid JSON, or holds a non-object value such
        as a list.
    """
    default = {"active_voice": None}
    try:
        loaded = json.loads(CONFIG_FILE.read_text())
    except FileNotFoundError:
        # No configuration saved yet: the normal first-run case.
        return default
    except (OSError, ValueError) as exc:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError. Keep
        # the best-effort fallback, but leave a trace instead of hiding it.
        print(f"Aviso: falha ao ler {CONFIG_FILE} ({exc}); usando config inicial.", flush=True)
        return default
    if not isinstance(loaded, dict):
        # Callers use .get() and item assignment on the result, so anything
        # other than a JSON object is treated like a corrupt file.
        print(f"Aviso: falha ao ler {CONFIG_FILE} (JSON inesperado); usando config inicial.", flush=True)
        return default
    return loaded
def _save_config(cfg: dict):
    """Serialize ``cfg`` as 2-space-indented JSON and overwrite ``CONFIG_FILE``.

    Args:
        cfg: JSON-serializable configuration mapping.
    """
    # Serialize first so a non-serializable cfg fails before the file is touched.
    serialized = json.dumps(cfg, indent=2)
    CONFIG_FILE.write_text(serialized)
# -- Load the XTTS-v2 model from the Hugging Face Hub --------------------------
# Avoids the dead Coqui servers. The snapshot is downloaded into the local
# cache directory; per the original author's note the call is a no-op when the
# files already exist. The model starts on the CPU.
print("Baixando/verificando modelo XTTS-v2 do HF Hub...")

# Either environment variable may carry the token; HF_TOKEN wins when both are set.
_hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
_model_dir = snapshot_download(repo_id=MODEL_HF_REPO, local_dir=str(MODEL_CACHE_DIR), token=_hf_token)

_xtts_config = XttsConfig()
_xtts_config.load_json(str(MODEL_CACHE_DIR.joinpath("config.json")))
# PyTorch 2.6+ switched torch.load to weights_only=True by default, which breaks
# TTS checkpoints that serialize arbitrary Python objects. torch.load is patched
# to use weights_only=False whenever the caller does not specify it.
# NOTE(review): the patch is process-wide, so every later torch.load call that
# omits weights_only unpickles arbitrary objects. The original author considers
# that acceptable because the checkpoint comes from the HF Hub -- confirm.
import torch as _torch

_orig_torch_load = _torch.load


def _torch_load_compat(*args, **kwargs):
    """Call the original ``torch.load``, defaulting ``weights_only`` to False."""
    if "weights_only" not in kwargs:
        kwargs["weights_only"] = False
    return _orig_torch_load(*args, **kwargs)


_torch.load = _torch_load_compat
# Build the model object from the loaded config, load the checkpoint found in
# MODEL_CACHE_DIR in eval mode, and keep the model on the CPU.
model = Xtts.init_from_config(_xtts_config)
model.load_checkpoint(_xtts_config, checkpoint_dir=str(MODEL_CACHE_DIR), eval=True)
model.cpu()
print("Modelo XTTS-v2 carregado na CPU.")
# In transformers>=4.50 PreTrainedModel no longer inherits from GenerationMixin,
# so GPT2InferenceModel (inside XTTS) loses .generate() and raises
# AttributeError. GenerationMixin is added explicitly to its base classes.
from transformers import GenerationMixin as _GenMixin
from transformers import GenerationConfig as _GenConfig
from TTS.tts.layers.xtts.gpt import GPT2InferenceModel as _GPT2Inf
from TTS.tts.layers.xtts.gpt import GPT as _GPT

# NOTE(review): indentation was lost in this copy of the file; the print is
# assumed to belong to the if-body -- confirm.
if not issubclass(_GPT2Inf, _GenMixin):
    _GPT2Inf.__bases__ = (_GenMixin, *_GPT2Inf.__bases__)
    print("Patch GenerationMixin aplicado ao GPT2InferenceModel.")
# gpt.py passes eos/bos/pad_token_id as direct kwargs to gpt_inference.generate().
# Per the original author's note, transformers>=4.45 deprecates and IGNORES those
# kwargs, so generation runs to max_length and produces noise. The fix has two
# parts:
#
#   1. Put the token ids into gpt_inference's generation_config.
#   2. Patch GPT.generate so those kwargs are stripped before being forwarded.
_stop = int(model.gpt.stop_audio_token)
_start = int(model.gpt.start_audio_token)
model.gpt.gpt_inference.generation_config = _GenConfig(
    bos_token_id=_start,
    eos_token_id=_stop,
    pad_token_id=_stop,
)
print(f"generation_config injetado: eos={_stop} bos={_start}.")

_orig_gpt_cls_generate = _GPT.generate


def _patched_gpt_cls_generate(self, cond_latents, text_inputs, **hf_generate_kwargs):
    """Forward to the original ``GPT.generate`` without the token-id kwargs."""
    dropped = ("eos_token_id", "bos_token_id", "pad_token_id")
    forwarded = {
        key: value for key, value in hf_generate_kwargs.items() if key not in dropped
    }
    return _orig_gpt_cls_generate(self, cond_latents, text_inputs, **forwarded)


_GPT.generate = _patched_gpt_cls_generate
print("Patch GPT.generate aplicado: kwargs de token removidos (vivem em generation_config).")
# Replace XTTS's load_audio with a soundfile-based loader instead of torchaudio.
# Per the original author's note, torchaudio 2.9+ defaults to the torchcodec
# backend, which is not installed. Reference audio is expected to already be
# 22050 Hz mono WAV (produced by convert_to_wav).
import torch as _torch2
import soundfile as _sf
import numpy as _np


def _load_audio_soundfile(audiopath, sampling_rate=22050):
    """Read an audio file with soundfile and return it as a ``(1, n)`` tensor.

    Args:
        audiopath: Path of the file to read (converted with ``str``).
        sampling_rate: Target rate; the signal is resampled when the file's
            rate differs.

    Returns:
        A ``torch.FloatTensor`` with a leading dimension of size 1. Only the
        tensor is returned, not a ``(tensor, rate)`` tuple.
    """
    samples, source_rate = _sf.read(str(audiopath), dtype="float32")

    # Average the channel axis when the file is not mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if source_rate != sampling_rate:
        from scipy import signal as _sig

        target_length = int(len(samples) * sampling_rate / source_rate)
        samples = _sig.resample(samples, target_length)

    return _torch2.FloatTensor(samples).unsqueeze(0)


import TTS.tts.models.xtts as _xtts_mod

_xtts_mod.load_audio = _load_audio_soundfile
# -- Audio conversion ----------------------------------------------------------
def convert_to_wav(input_path: str) -> str:
    """Convert an audio file to a temporary 22050 Hz mono WAV file.

    Args:
        input_path: Path of the source audio file. The decoder format is
            chosen from the file extension; unknown extensions are decoded
            as ``"wav"``.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        Exception: Whatever pydub raises while decoding or exporting. The
            temporary file is removed before the error propagates.
    """
    src = Path(input_path)
    ext = src.suffix.lower()
    fmt_map = {
        ".mp3": "mp3", ".ogg": "ogg", ".flac": "flac",
        ".m4a": "m4a", ".aac": "aac", ".opus": "ogg",
        ".wma": "asf", ".webm": "webm", ".wav": "wav",
    }
    fmt = fmt_map.get(ext, "wav")
    audio = AudioSegment.from_file(str(src), format=fmt)
    audio = audio.set_channels(1).set_frame_rate(22050)

    # mkstemp + close reserves a unique path without keeping an open handle
    # around (the previous NamedTemporaryFile object was never closed).
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        exported = audio.export(out_path, format="wav")
        # export() hands back the file object it wrote to; close it so the
        # data is flushed and the descriptor is released right away.
        close = getattr(exported, "close", None)
        if callable(close):
            close()
    except Exception:
        # Do not leave an orphaned temp file behind when the export fails.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path
# -- Generation core -----------------------------------------------------------
# NOTE(review): the original header describes this as GPU-exclusive via ZeroGPU
# and the file imports `spaces`, yet no @spaces.GPU decorator is visible on this
# function -- confirm whether that is intentional.
def synthesize(text: str, speaker_wav: str, language: str = "pt") -> bytes:
    """Synthesize ``text`` in the voice of ``speaker_wav`` and return MP3 bytes.

    The module-level ``model`` is moved to CUDA for the duration of the call
    and always moved back to the CPU afterwards.

    Args:
        text: Text to speak.
        speaker_wav: Path of the reference audio used for voice conditioning.
        language: Language code forwarded to ``model.inference``.

    Returns:
        The encoded MP3 (128k bitrate) as ``bytes``.

    Raises:
        Exception: Any error from the model or the encoders; the traceback is
            printed before it is re-raised.
    """
    model.cuda()
    try:
        cond_latent, spk_embedding = model.get_conditioning_latents(
            audio_path=[speaker_wav],
            gpt_cond_len=30,
            max_ref_length=60,
        )
        result = model.inference(
            text=text,
            language=language,
            gpt_cond_latent=cond_latent,
            speaker_embedding=spk_embedding,
            temperature=0.7,
        )
        samples = np.array(result["wav"], dtype=np.float32)

        # Peak-normalize (skipped for an all-zero signal), then scale to 16-bit PCM.
        peak = np.abs(samples).max()
        if peak > 0:
            samples = samples / peak
        pcm16 = (samples * 32767).astype(np.int16)

        # In-memory WAV first, then MP3 through pydub.
        wav_io = io.BytesIO()
        sf.write(wav_io, pcm16, 24000, format="WAV", subtype="PCM_16")
        wav_io.seek(0)
        mp3_io = io.BytesIO()
        AudioSegment.from_wav(wav_io).export(mp3_io, format="mp3", bitrate="128k")
        return mp3_io.getvalue()
    except Exception:
        print("=== SYNTHESIZE ERROR ===", flush=True)
        print(traceback.format_exc(), flush=True)
        raise
    finally:
        model.cpu()
| # ββ Helpers de voz ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def list_voices() -> list[str]:
    """Return the names of all saved voices, sorted alphabetically.

    A voice name is the stem of a ``*.wav`` file directly inside ``VOICES_DIR``.
    """
    names = [wav_file.stem for wav_file in VOICES_DIR.glob("*.wav")]
    names.sort()
    return names
def get_voice_path(voice_name: str) -> Optional[str]:
    """Return the path of a saved voice's WAV file, or ``None``.

    Args:
        voice_name: Name of the voice (file stem inside ``VOICES_DIR``).

    Returns:
        The path as a string when ``VOICES_DIR/<voice_name>.wav`` exists;
        ``None`` when it does not, or when ``voice_name`` is empty or contains
        a path separator or NUL byte.
    """
    # voice_name can come straight from an API request body, so refuse anything
    # that could make the joined path leave VOICES_DIR.
    if not isinstance(voice_name, str) or not voice_name:
        return None
    if any(forbidden in voice_name for forbidden in ("/", "\\", "\x00")):
        return None
    candidate = VOICES_DIR / f"{voice_name}.wav"
    return str(candidate) if candidate.exists() else None
def get_active_voice() -> Optional[str]:
    """Return the configured active voice if its WAV file still exists.

    Returns:
        The ``active_voice`` value from the stored configuration, or ``None``
        when none is set or the matching file is gone from ``VOICES_DIR``.
    """
    candidate = _load_config().get("active_voice")
    if not candidate:
        return None
    # Guard against a stale entry whose file has since been removed.
    still_there = (VOICES_DIR / f"{candidate}.wav").exists()
    return candidate if still_there else None
def set_active_voice(voice_name: str) -> str:
    """Persist ``voice_name`` as the active voice and return it.

    The stored configuration is loaded, its ``active_voice`` entry replaced,
    and the result written back; other entries are kept.
    """
    _save_config({**_load_config(), "active_voice": voice_name})
    return voice_name
# -- UI callbacks --------------------------------------------------------------
def ui_generate(text, voice_sel, ref_audio, lang_sel):
    """Gradio callback: synthesize ``text`` and report a status message.

    The reference voice is chosen in this order: the uploaded/recorded
    ``ref_audio``, then the selected saved voice, then the active voice.

    Args:
        text: Text to synthesize.
        voice_sel: Name of a saved voice, or a falsy value.
        ref_audio: Path of an ad-hoc reference recording, or a falsy value.
        lang_sel: Dropdown label; unknown labels map to ``"pt"``.

    Returns:
        ``(audio_bytes, status)`` on success, ``(None, message)`` otherwise.
        Unexpected errors are returned as a formatted traceback in the status.
    """
    try:
        if not text or not text.strip():
            return None, "Texto vazio."

        language = LANG_MAP.get(lang_sel, "pt")

        # Only a converted ad-hoc reference is a temp file that must be removed.
        cleanup = False
        if ref_audio:
            wav_path, cleanup, label = convert_to_wav(ref_audio), True, "referΓͺncia enviada"
        elif voice_sel:
            wav_path, label = get_voice_path(voice_sel), voice_sel
        else:
            active = get_active_voice()
            if not active:
                return None, "Nenhuma voz disponΓvel. Salve uma voz na aba Vozes ou envie uma referΓͺncia."
            wav_path, label = get_voice_path(active), f"{active} (ativa)"

        if not wav_path:
            return None, "Arquivo de voz nΓ£o encontrado."

        try:
            audio_bytes = synthesize(text.strip(), wav_path, language)
        finally:
            if cleanup:
                os.unlink(wav_path)

        return audio_bytes, f"Gerado com '{label}' β {len(audio_bytes) // 1024} KB"
    except Exception:
        return None, traceback.format_exc()
def ui_save_voice(name, audio_file):
    """Gradio callback: convert an uploaded sample and store it as a voice.

    The name is stripped, lower-cased and has spaces replaced with ``_``. The
    sample is converted to WAV and moved to ``VOICES_DIR/<name>.wav``. When no
    active voice is set, the new one becomes active.

    Args:
        name: Free-text voice name typed by the user.
        audio_file: Path of the uploaded file.

    Returns:
        A 4-tuple ``(status message, manage-dropdown update,
        generate-dropdown update, active-dropdown update)``. On validation
        failure or error the three updates are empty ``gr.update()`` values.
    """
    unchanged = (gr.update(), gr.update(), gr.update())
    try:
        if not name or not name.strip():
            return ("Nome da voz Γ© obrigatΓ³rio.", *unchanged)
        if not audio_file:
            return ("Envie um arquivo de Γ‘udio.", *unchanged)

        name = name.strip().lower().replace(" ", "_")
        # The name becomes a file name: reject separators so the destination
        # cannot escape VOICES_DIR.
        if any(forbidden in name for forbidden in ("/", "\\", "\x00")):
            return ("Evite '/' e '\\' no nome da voz.", *unchanged)

        src = Path(audio_file)
        if src.suffix.lower() not in SUPPORTED_FORMATS:
            return (
                f"Formato '{src.suffix}' nΓ£o suportado. Use: {', '.join(SUPPORTED_FORMATS)}",
                *unchanged,
            )

        wav_tmp = convert_to_wav(audio_file)
        dest = VOICES_DIR / f"{name}.wav"
        try:
            shutil.move(wav_tmp, dest)
        except Exception:
            # Do not leave the converted temp file behind if the move fails.
            if os.path.exists(wav_tmp):
                os.unlink(wav_tmp)
            raise

        voices = list_voices()
        active = get_active_voice()
        msg = f"Voz '{name}' salva."
        if not active:
            set_active_voice(name)
            active = name
            msg += " Definida como voz ativa."
        return (
            msg,
            gr.update(choices=voices, value=name),    # manage dropdown
            gr.update(choices=voices, value=name),    # generate dropdown
            gr.update(choices=voices, value=active),  # active dropdown
        )
    except Exception:
        return (traceback.format_exc(), *unchanged)
def ui_delete_voice(voice_name):
    """Gradio callback: delete a saved voice and refresh the dropdowns.

    If the deleted voice was the active one, the first remaining voice (or
    ``None``) becomes active.

    Args:
        voice_name: Name of the voice to delete.

    Returns:
        A 4-tuple ``(status message, manage-dropdown update,
        generate-dropdown update, active-dropdown update)``. Errors are
        returned as a formatted traceback with empty updates.
    """
    try:
        if not voice_name:
            return "Selecione uma voz.", gr.update(), gr.update(), gr.update()

        target = VOICES_DIR / f"{voice_name}.wav"
        if target.exists():
            target.unlink()

        cfg = _load_config()
        if cfg.get("active_voice") == voice_name:
            remaining = list_voices()
            cfg["active_voice"] = remaining[0] if remaining else None
            _save_config(cfg)

        voices = list_voices()
        first_voice = voices[0] if voices else None
        active = get_active_voice()
        status = f"Voz '{voice_name}' excluΓda."
        return (
            status,
            gr.update(choices=voices, value=first_voice),
            gr.update(choices=voices, value=first_voice),
            gr.update(choices=voices, value=active),
        )
    except Exception:
        return traceback.format_exc(), gr.update(), gr.update(), gr.update()
def ui_set_active(voice_name):
    """Gradio callback: persist ``voice_name`` as the active voice.

    Returns:
        ``(status message, dropdown update)``; the update is empty when no
        voice was selected.
    """
    if voice_name:
        set_active_voice(voice_name)
        return f"Voz ativa: '{voice_name}'", gr.update(value=voice_name)
    return "Selecione uma voz.", gr.update()
def ui_preview_voice(voice_name):
    """Gradio callback: return the WAV path of ``voice_name`` for preview.

    Returns ``None`` when no voice is selected or its file does not exist.
    """
    if voice_name:
        found = get_voice_path(voice_name)
        if found:
            return found
    return None
def ui_refresh_all():
    """Gradio callback: re-read the voices and refresh the three dropdowns.

    Returns:
        ``(manage-dropdown update, generate-dropdown update,
        active-dropdown update)``. The first two select the first voice (or
        ``None``); the last selects the active voice.
    """
    voices = list_voices()
    active = get_active_voice()
    first_voice = voices[0] if voices else None
    manage_update = gr.update(choices=voices, value=first_voice)
    generate_update = gr.update(choices=voices, value=first_voice)
    active_update = gr.update(choices=voices, value=active)
    return manage_update, generate_update, active_update
# -- Gradio interface ----------------------------------------------------------
# NOTE(review): this copy of the file lost its indentation; the nesting of the
# layout containers below is reconstructed from the statement order -- confirm
# against the deployed layout.
_voices_init = list_voices()
_active_init = get_active_voice()


def _active_badge_text(voice):
    """Return the text shown in the read-only active-voice badge."""
    return f"Ativa: {voice or 'β'}"


with gr.Blocks(title="AVA TTS") as demo:
    gr.Markdown("# AVA TTS\nSΓntese de voz com clone via XTTS-v2 β’ API em `/v1/audio/speech`")
    # State shared between tabs (not wired to any event in this file).
    voice_gen_dd = gr.State(None)

    # -- Tab: generate ---------------------------------------------------------
    with gr.Tab("Gerar"):
        with gr.Row():
            with gr.Column(scale=2):
                txt_input = gr.Textbox(
                    label="Texto",
                    lines=5,
                    placeholder="Digite o texto a sintetizar...",
                )
                lang_dd = gr.Dropdown(
                    choices=LANG_CHOICES,
                    value="PortuguΓͺs (BR) (pt)",
                    label="Idioma",
                )
                with gr.Row():
                    voice_dd_gen = gr.Dropdown(
                        choices=_voices_init,
                        value=_active_init,
                        label="Voz (deixe vazio para usar a voz ativa)",
                    )
                    active_badge = gr.Textbox(
                        value=_active_badge_text(_active_init),
                        label="",
                        interactive=False,
                        scale=1,
                    )
                ref_audio_gen = gr.Audio(
                    label="ReferΓͺncia ad-hoc (sobrepΓ΅e voz selecionada)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                gen_btn = gr.Button("Sintetizar", variant="primary")
            with gr.Column(scale=1):
                audio_out = gr.Audio(label="Resultado", type="numpy")
                gen_status = gr.Textbox(label="Status", interactive=False, lines=3)
        gen_btn.click(
            ui_generate,
            [txt_input, voice_dd_gen, ref_audio_gen, lang_dd],
            [audio_out, gen_status],
        )

    # -- Tab: voices -----------------------------------------------------------
    with gr.Tab("Vozes"):
        gr.Markdown(
            "Salve amostras de voz (3β10s de Γ‘udio limpo). "
            f"Formatos aceitos: **{', '.join(SUPPORTED_FORMATS)}** β convertidos automaticamente para WAV."
        )
        with gr.Row():
            # Column: add a voice
            with gr.Column():
                gr.Markdown("### Adicionar voz")
                new_name = gr.Textbox(label="Nome", placeholder="ex: ava, narrador, lucas")
                new_audio = gr.File(
                    label="Arquivo de Γ‘udio",
                    file_types=SUPPORTED_FORMATS,
                )
                save_btn = gr.Button("Salvar voz", variant="primary")
                save_status = gr.Textbox(label="Status", interactive=False, lines=2)
            # Column: manage existing voices
            with gr.Column():
                gr.Markdown("### Gerenciar vozes")
                with gr.Row():
                    active_dd = gr.Dropdown(
                        choices=_voices_init,
                        value=_active_init,
                        label="Voz ativa (usada como padrΓ£o na API)",
                    )
                    set_active_btn = gr.Button("Definir ativa", variant="secondary")
                voice_mgr_dd = gr.Dropdown(
                    choices=_voices_init,
                    value=_voices_init[0] if _voices_init else None,
                    label="Selecionar voz",
                )
                preview_audio = gr.Audio(label="Preview da referΓͺncia", type="filepath", interactive=False)
                with gr.Row():
                    refresh_btn = gr.Button("Atualizar lista")
                    del_btn = gr.Button("Excluir selecionada", variant="stop")
                mgr_status = gr.Textbox(label="Status", interactive=False, lines=2)

        # Events -- voices. Every action that can change the active voice is
        # followed by a refresh of the badge shown on the generate tab.
        voice_mgr_dd.change(ui_preview_voice, [voice_mgr_dd], [preview_audio])
        set_active_btn.click(
            ui_set_active,
            [active_dd],
            [mgr_status, active_dd],
        ).then(
            _active_badge_text,
            [active_dd],
            [active_badge],
        )
        save_btn.click(
            ui_save_voice,
            [new_name, new_audio],
            [save_status, voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            _active_badge_text,
            [active_dd],
            [active_badge],
        )
        del_btn.click(
            ui_delete_voice,
            [voice_mgr_dd],
            [mgr_status, voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            _active_badge_text,
            [active_dd],
            [active_badge],
        )
        refresh_btn.click(
            ui_refresh_all,
            [],
            [voice_mgr_dd, voice_dd_gen, active_dd],
        ).then(
            _active_badge_text,
            [active_dd],
            [active_badge],
        )

    # -- Tab: API --------------------------------------------------------------
    # The curl example saves to audio.mp3 because the endpoint answers with
    # audio/mpeg (MP3) data, not WAV.
    with gr.Tab("API"):
        gr.Markdown("""
## POST `/v1/audio/speech`
```bash
curl -X POST https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/audio/speech \\
-H "Authorization: Bearer $HF_TOKEN" \\
-H "Content-Type: application/json" \\
-d '{"model":"xtts-v2","input":"OlΓ‘! Eu sou a AVA.","voice":"ava","language":"pt"}' \\
--output audio.mp3
```
| Campo | Tipo | DescriΓ§Γ£o |
|---|---|---|
| `input` | string | Texto a sintetizar |
| `voice` | string | Nome de voz salva (padrΓ£o: voz ativa) |
| `language` | string | CΓ³digo do idioma (padrΓ£o: `pt`) |
| `speaker_wav_b64` | string | Γudio WAV em base64 (alternativa ao `voice`) |
## GET `/v1/voices`
Retorna lista de vozes salvas e qual estΓ‘ ativa.
```bash
curl https://huggingface.co/proxy/mathiasvinicius-ava-tts.hf.space/v1/voices \\
-H "Authorization: Bearer $HF_TOKEN"
```
""")
# -- OpenAI-compatible API -----------------------------------------------------
# Request body accepted by api_speech (POST /v1/audio/speech).
class SpeechRequest(BaseModel):
    # Model identifier; api_speech does not read it.
    model: Optional[str] = "xtts-v2"
    # Text to synthesize; api_speech rejects empty/whitespace-only input with a 400.
    input: str
    voice: Optional[str] = None  # None -> use the active voice
    # Language code; api_speech falls back to "pt" when this is empty.
    language: Optional[str] = "pt"
    # Base64-encoded reference audio; when set it is used instead of `voice`.
    speaker_wav_b64: Optional[str] = None
def api_list_voices():
    """Return the saved voice names and the currently active voice.

    Returns:
        A dict with keys ``"voices"`` (sorted list of names) and ``"active"``
        (name or ``None``).
    """
    payload = {"voices": list_voices()}
    payload["active"] = get_active_voice()
    return payload
def api_speech(req: SpeechRequest):
    """Handle POST /v1/audio/speech: synthesize ``req.input`` and return MP3.

    The reference voice is ``req.speaker_wav_b64`` when provided (decoded into
    a temporary file that is deleted afterwards), otherwise the saved voice
    named by ``req.voice``, otherwise the active voice.

    Returns:
        A ``StreamingResponse`` with ``audio/mpeg`` content on success, or a
        ``JSONResponse`` with an ``"error"`` field: 400 for empty input or
        malformed/empty base64 audio, 404 when no usable voice is found, 500
        for any other failure.
    """
    try:
        if not req.input or not req.input.strip():
            return JSONResponse(status_code=400, content={"error": "input vazio"})

        cleanup = False
        if req.speaker_wav_b64:
            import base64
            import binascii

            # Malformed client data is a client error, not a server error.
            try:
                raw = base64.b64decode(req.speaker_wav_b64)
            except (binascii.Error, ValueError):
                return JSONResponse(status_code=400, content={"error": "speaker_wav_b64 com base64 malformado"})
            if not raw:
                return JSONResponse(status_code=400, content={"error": "speaker_wav_b64 vazio"})

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(raw)
                speaker_wav = tmp.name
            cleanup = True
        else:
            voice_name = req.voice or get_active_voice()
            if not voice_name:
                return JSONResponse(status_code=404, content={"error": "Nenhuma voz ativa. Defina uma voz na UI ou passe 'voice' no request."})
            speaker_wav = get_voice_path(voice_name)
            if not speaker_wav:
                return JSONResponse(status_code=404, content={"error": f"Voz '{voice_name}' nΓ£o encontrada. Use GET /v1/voices."})

        try:
            audio_bytes = synthesize(req.input.strip(), speaker_wav, req.language or "pt")
        finally:
            # Only the decoded upload is a temp file; saved voices are kept.
            if cleanup:
                os.unlink(speaker_wav)

        return StreamingResponse(
            io.BytesIO(audio_bytes),
            media_type="audio/mpeg",
            headers={"Content-Disposition": f'attachment; filename="speech-{uuid.uuid4().hex[:8]}.mp3"'},
        )
    except Exception:
        # NOTE(review): the full traceback is sent to the API client, which
        # exposes internal paths and code -- confirm this is wanted outside of
        # debugging.
        return JSONResponse(status_code=500, content={"error": traceback.format_exc()})
# Launch the Gradio UI and keep the first element of the returned tuple as
# `app`, so the two REST routes can be registered on it afterwards.
# prevent_thread_lock=True is presumably what lets launch() return here.
app, _, _ = demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    prevent_thread_lock=True,
    ssr_mode=False,
    theme=gr.themes.Soft(),
)
# REST endpoints served next to the UI.
app.add_api_route("/v1/voices", api_list_voices, methods=["GET"])
app.add_api_route("/v1/audio/speech", api_speech, methods=["POST"])
# Park the main thread now that the routes are in place.
demo.block_thread()