Upload 3 files
- app.py +120 -0
- styles.css +117 -0
- tts.py +196 -0
app.py
ADDED
@@ -0,0 +1,120 @@
import asyncio
import os
import shutil

import gradio as gr

from tts import synthesize_and_play_audio


async def generate_tts(input_text, reference_audio_path, output_path="cloned.wav"):
    print(f"Using reference audio: {reference_audio_path}")
    await synthesize_and_play_audio(
        input_text=input_text,
        reference_audio_path=reference_audio_path,
        model="voxtral-mini-tts-260213",
        # Read the key from the environment instead of hardcoding an empty string
        api_key=os.environ.get("MISTRAL_API_KEY", ""),
        output_path=output_path,
        no_play=True,
    )
    return output_path


def gradio_tts(input_text, audio_choice, uploaded_audio=None):
    # If an audio file is uploaded, save it to a fixed path
    if uploaded_audio is not None:
        reference_audio = "uploaded_reference.wav"
        shutil.copy(uploaded_audio, reference_audio)
    else:
        reference_audio = audio_choice

    output_path = "cloned.wav"
    try:
        return asyncio.run(generate_tts(input_text, reference_audio, output_path))
    except Exception as e:
        print(f"Error: {e}")
        return None


with open("styles.css", "r") as f:
    css = f.read()

examples = [
    ["Frontier AI in your hands.", "sample.mp3"],
    ["Hello, world!", "voice.wav"],
    ["This is a test.", "4languages.mp3"],
]

# css belongs to gr.Blocks(); demo.launch() does not accept a css argument.
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown("Voxtral TTS is a text-to-speech model that can clone a voice from any reference audio. Learn more about [Voxtral TTS](https://huggingface.co/mistralai/Voxtral-3B-TTS-2603).", elem_classes="markdown")
    with gr.Tabs():
        with gr.TabItem("Predefined Voices"):
            gr.Markdown("# Predefined Voices TTS", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and select a predefined voice.", elem_classes="markdown")
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=["sample.mp3", "voice.wav", "4languages.mp3"],
                        value="4languages.mp3",
                    )
                    submit_btn_predefined = gr.Button("Generate Audio", elem_classes="gradio-button")
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(label="Generated audio", elem_classes="gradio-audio")

            submit_btn_predefined.click(
                fn=lambda text, choice: gradio_tts(text, choice, None),
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
            )
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=lambda text, choice: gradio_tts(text, choice, None),
                cache_examples=True,
            )
        with gr.TabItem("Voice Cloning"):
            gr.Markdown("# Voice Cloning TTS", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and upload your reference audio.", elem_classes="markdown")
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio",
                        type="filepath",
                        sources=["upload"],
                        elem_classes="gradio-audio",
                    )
                    submit_btn_cloning = gr.Button("Generate Audio", elem_classes="gradio-button")
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(label="Generated audio", elem_classes="gradio-audio")

            submit_btn_cloning.click(
                fn=lambda text, audio: gradio_tts(text, None, audio),
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
            )
            gr.Examples(
                examples=[
                    ["Frontier AI in your hands.", "sample.mp3"],
                    ["Hello, world!", "voice.wav"],
                ],
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=lambda text, audio: gradio_tts(text, None, audio),
                cache_examples=True,
            )

if __name__ == "__main__":
    demo.launch(share=False)
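For the Space to build, the third-party imports above need a requirements file; none is included in this commit. A minimal sketch, assuming no dependencies beyond what app.py and tts.py actually import:

    gradio
    aiohttp
    aiofiles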
styles.css
ADDED
@@ -0,0 +1,117 @@
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Inter:wght@400;500;600;700&display=swap');

/* Light mode (default) */
:root {
  --bg-color: #FFFAEB;
  --bg-grid-color: #E9E2CB;
  --card-bg-color: #FFFAEB;
  --card-border-color: #E9E2CB;
  --text-color: #1E1E1E;
  --secondary-text-color: #444444;
  --accent-color: #FF8205;
  --box-shadow-color: rgba(0, 0, 0, 0.08);
  --grid-opacity: 0.05;
}

/* Dark mode (activated by adding the `dark` class to body or .gradio-container) */
.dark {
  --bg-color: #121212;
  --bg-grid-color: rgba(255, 255, 255, 0.05);
  --card-bg-color: #1E1E1E;
  --card-border-color: #333333;
  --text-color: #FFFFFF;
  --secondary-text-color: #CCCCCC;
  --accent-color: #FF8205;
  --box-shadow-color: rgba(0, 0, 0, 0.3);
  --grid-opacity: 0.02;
}

body, .gradio-container {
  background-color: var(--bg-color) !important;
  background-image:
    linear-gradient(var(--bg-grid-color) 1px, transparent 1px),
    linear-gradient(90deg, var(--bg-grid-color) 1px, transparent 1px) !important;
  background-size: 40px 40px !important;
  font-family: 'Inter', sans-serif !important;
  color: var(--text-color) !important;
  padding: 0 !important;
  margin: 0 !important;
}

.gradio-container {
  max-width: 100% !important;
  padding: 2rem !important;
}

#transcribe_audio {
  background: linear-gradient(135deg, var(--card-bg-color) 0%, #FFF0C3 100%) !important;
  border: 2px solid var(--card-border-color) !important;
  border-top: 4px solid var(--accent-color) !important;
  padding: 2rem !important;
  box-shadow: 0 4px 24px var(--box-shadow-color) !important;
}

h1 {
  font-size: 2rem !important;
  font-weight: 700 !important;
  color: var(--text-color) !important;
  margin: 0 0 0.5rem 0 !important;
  letter-spacing: -0.02em !important;
}

.markdown {
  font-family: 'Inter', sans-serif !important;
  color: var(--secondary-text-color) !important;
}

.gradio-box {
  background: var(--card-bg-color) !important;
  border: 2px solid var(--card-border-color) !important;
  box-shadow: 0 8px 32px var(--box-shadow-color) !important;
  overflow: hidden !important;
  padding: 1rem !important;
}

.gradio-textbox {
  font-family: 'JetBrains Mono', monospace !important;
  font-size: 1.1rem !important;
  line-height: 1.8 !important;
  color: var(--text-color) !important;
  white-space: pre-wrap !important;
  word-break: break-word !important;
  text-align: left !important;
  min-height: 200px !important;
  background-color: var(--card-bg-color) !important;
  background-image:
    linear-gradient(rgba(0, 0, 0, var(--grid-opacity)) 1px, transparent 1px),
    linear-gradient(90deg, rgba(0, 0, 0, var(--grid-opacity)) 1px, transparent 1px) !important;
  background-size: 20px 20px !important;
  padding: 1rem !important;
  border: 1px solid var(--card-border-color) !important;
}

.gradio-button {
  background: var(--accent-color) !important;
  color: white !important;
  border: none !important;
  padding: 0.75rem 1.5rem !important;
  font-weight: 600 !important;
  text-transform: uppercase !important;
  letter-spacing: 0.05em !important;
  font-size: 0.85rem !important;
  cursor: pointer !important;
  transition: all 0.2s !important;
}

.gradio-button:hover {
  background: #E67200 !important;
}

.gradio-audio {
  margin: 0 !important;
  min-height: 250px !important;
}

footer {
  display: none !important;
}
tts.py
ADDED
@@ -0,0 +1,196 @@
from __future__ import annotations

import asyncio
import base64
import io
import os
import pathlib
import shutil
import struct
import sys
import wave
from typing import Optional

import aiofiles
import aiohttp


async def synthesize_and_play_audio(
    input_text: str = "Hello!",
    reference_audio_path: str = "~/Downloads/jmsample.wav",
    model: str = "voxtral-mini-tts-260213",
    # Default to the MISTRAL_API_KEY environment variable rather than sending
    # the variable's name as a literal key
    api_key: str = os.environ.get("MISTRAL_API_KEY", ""),
    output_path: str = "/tmp/voxtral.wav",
    timeout: float = 180.0,
    raw_sample_rate: int = 24000,
    raw_channels: int = 1,
    no_play: bool = False,
    url: str = "https://api.mistral.ai/v1/audio/text-to-speech",
    reference_format: str = "raw-base64",
) -> int:
    """
    Asynchronously synthesize speech from input text using a reference audio file and play the output.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Path to the reference WAV file.
        model: Model name sent in the request.
        api_key: API key for authentication.
        output_path: Output audio file path.
        timeout: HTTP timeout in seconds.
        raw_sample_rate: Sample rate used when the response audio is raw f32le (non-WAV).
        raw_channels: Channel count used when the response audio is raw f32le (non-WAV).
        no_play: If True, only save the audio; do not launch a player.
        url: TTS endpoint URL.
        reference_format: How to serialize reference_audio ("data-uri" or "raw-base64").

    Returns:
        int: 0 on success, non-zero on failure.
    """
    print(f"Synthesizing: {input_text!r}")

    try:
        reference_path = pathlib.Path(reference_audio_path).expanduser().resolve()
        if not reference_path.is_file():
            print(f"Reference audio not found: {reference_path}", file=sys.stderr)
            return 2

        # Read the reference audio asynchronously
        async with aiofiles.open(reference_path, 'rb') as f:
            reference_bytes = await f.read()

        reference_b64 = base64.b64encode(reference_bytes).decode("ascii")
        if reference_format == "data-uri":
            reference_audio = f"data:audio/wav;base64,{reference_b64}"
        else:
            reference_audio = reference_b64

        payload = {
            "model": model,
            "input": input_text,
            "reference_audio": reference_audio,
            "response_format": "wav",
        }
        headers = {
            "content-type": "application/json",
            "x-api-key": api_key,
        }

        # Use an async HTTP client; wrap the timeout in ClientTimeout, the
        # type aiohttp expects
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(
                    url,
                    headers=headers,
                    json=payload,
                    timeout=aiohttp.ClientTimeout(total=timeout),
                ) as response:
                    if response.status >= 400:
                        error_text = await response.text()
                        print(f"Request failed: {response.status}", file=sys.stderr)
                        print(error_text, file=sys.stderr)
                        return 1

                    content_type = response.headers.get("content-type", "")
                    audio_bytes = await response.read()

                    if "application/json" in content_type.lower():
                        # response.json() reuses the body already cached by read()
                        body = await response.json()
                        audio_field = body.get("audio") if isinstance(body, dict) else None
                        if not isinstance(audio_field, str) or not audio_field:
                            print("JSON response does not contain an 'audio' field.", file=sys.stderr)
                            print(body, file=sys.stderr)
                            return 1

                        # Strip a data-URI prefix, if present, before decoding
                        if "," in audio_field and audio_field.startswith("data:"):
                            audio_field = audio_field.split(",", 1)[1]
                        try:
                            audio_bytes = base64.b64decode(audio_field, validate=False)
                        except Exception as exc:
                            print(f"Failed to decode JSON audio field as base64: {exc}", file=sys.stderr)
                            return 1

                    if not _looks_like_wav(audio_bytes):
                        converted = _convert_f32le_to_wav(
                            audio_bytes,
                            sample_rate=raw_sample_rate,
                            channels=raw_channels,
                        )
                        if converted is not None:
                            audio_bytes = converted
                            print(
                                "Response was non-WAV raw audio; converted f32le stream to WAV.",
                                file=sys.stderr,
                            )
                        else:
                            print(
                                "Response bytes are non-WAV and could not be auto-converted.",
                                file=sys.stderr,
                            )

                    # Write the output file asynchronously
                    output_path_obj = pathlib.Path(output_path).expanduser()
                    async with aiofiles.open(output_path_obj, 'wb') as f:
                        await f.write(audio_bytes)
                    print(f"Wrote {len(audio_bytes)} bytes to {output_path_obj}")

                    if not no_play:
                        await maybe_play_audio_async(output_path_obj)

                    return 0

            except asyncio.TimeoutError:
                print(f"Request timed out after {timeout} seconds", file=sys.stderr)
                return 1
            except aiohttp.ClientError as e:
                print(f"HTTP client error: {e}", file=sys.stderr)
                return 1

    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1


def _looks_like_wav(audio_bytes: bytes) -> bool:
    # WAV files begin with a RIFF chunk whose format tag is "WAVE"
    return len(audio_bytes) >= 12 and audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE"


def _convert_f32le_to_wav(audio_bytes: bytes, sample_rate: int, channels: int) -> Optional[bytes]:
    # A valid f32le stream is a whole number of 4-byte floats
    if len(audio_bytes) % 4 != 0:
        return None
    if channels <= 0 or sample_rate <= 0:
        return None

    # Clip each float sample to [-1, 1] and scale it to 16-bit PCM
    pcm16 = bytearray()
    for (sample,) in struct.iter_unpack("<f", audio_bytes):
        clipped = max(-1.0, min(1.0, sample))
        pcm16.extend(struct.pack("<h", int(clipped * 32767)))

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(bytes(pcm16))
    return buffer.getvalue()


async def maybe_play_audio_async(path: pathlib.Path) -> None:
    """Asynchronously play audio using the first available local player."""
    players = (
        ("afplay", [str(path)]),
        ("ffplay", ["-nodisp", "-autoexit", str(path)]),
        ("mpv", [str(path)]),
    )

    for command, extra_args in players:
        if shutil.which(command) is None:
            continue
        try:
            proc = await asyncio.create_subprocess_exec(
                command, *extra_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            await proc.wait()
            if proc.returncode == 0:
                return
            print(
                f"Player '{command}' failed with exit code {proc.returncode}, trying next player.",
                file=sys.stderr,
            )
        except Exception as e:
            print(f"Error with player '{command}': {e}", file=sys.stderr)
            continue

    print("No local audio player found (afplay/ffplay/mpv).", file=sys.stderr)
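tts.py also works standalone, outside the Gradio app. A minimal usage sketch, assuming MISTRAL_API_KEY is set in the environment and a local reference.wav exists (both names are hypothetical here):

    import asyncio

    from tts import synthesize_and_play_audio

    # Hypothetical reference clip; any short, clean speech recording works.
    status = asyncio.run(synthesize_and_play_audio(
        input_text="Frontier AI in your hands.",
        reference_audio_path="reference.wav",
        output_path="cloned.wav",
        no_play=True,  # save only; set False to also play via afplay/ffplay/mpv
    ))
    print("exit status:", status)  # 0 on success, 1 or 2 on failure (see docstring)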