pandora-s committed on
Commit
478642e
·
verified ·
1 Parent(s): b253688

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +120 -0
  2. styles.css +117 -0
  3. tts.py +196 -0
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
import shutil
import tempfile
import time

import gradio as gr
from pydub import AudioSegment

from tts import synthesize_and_play_audio
8
+
9
async def generate_tts(input_text, reference_audio_path, output_path="cloned.wav"):
    """Synthesize `input_text` as speech cloned from `reference_audio_path`.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Path to the reference voice audio file.
        output_path: Where the generated WAV is written.

    Returns:
        The path of the generated audio file.

    Raises:
        RuntimeError: If the TTS backend reports a non-zero status.
    """
    print(reference_audio_path)
    # The API key must come from the environment, not be hardcoded empty —
    # `os` was imported for exactly this purpose.
    status = await synthesize_and_play_audio(
        input_text=input_text,
        reference_audio_path=reference_audio_path,
        model="voxtral-mini-tts-260213",
        api_key=os.environ.get("MISTRAL_API_KEY", ""),
        output_path=output_path,
        no_play=True
    )
    # synthesize_and_play_audio signals failure with a non-zero return code;
    # raising here lets the caller's error handling react instead of this
    # function returning a path to a stale or missing file.
    if status != 0:
        raise RuntimeError(f"TTS synthesis failed with status {status}")
    return output_path
20
+
21
def gradio_tts(input_text, audio_choice, uploaded_audio=None):
    """Gradio handler: run TTS with a predefined or user-uploaded voice.

    Args:
        input_text: Text to synthesize.
        audio_choice: Path of a predefined reference voice (used when there is
            no upload).
        uploaded_audio: Optional filepath of a user-uploaded reference audio.

    Returns:
        Path to the generated audio file, or None on failure.
    """
    if uploaded_audio is not None:
        # Copy the upload to a unique temp file: a fixed filename such as
        # "uploadedreference.wav" is clobbered when several Gradio sessions
        # submit concurrently.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            reference_audio = tmp.name
        shutil.copy(uploaded_audio, reference_audio)
    else:
        reference_audio = audio_choice

    # Unique output path per request, for the same concurrency reason.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    try:
        return asyncio.run(generate_tts(input_text, reference_audio, output_path))
    except Exception as e:
        # Best-effort UI handler: log the error and let Gradio show an
        # empty result instead of crashing the app.
        print(f"Error: {e}")
        return None
36
+
37
+
38
+
39
# Load the custom stylesheet once at import time.
with open("styles.css", "r") as f:
    css = f.read()

# (text, reference voice) pairs shown as clickable examples in the UI.
examples = [
    ["Frontier AI in your hands.", "sample.mp3"],
    ["Hello, world!", "voice.wav"],
    ["This is a test.", "4languages.mp3"],
]

# BUG FIX: custom CSS must be passed to the gr.Blocks() constructor;
# gr.Blocks.launch() has no `css` parameter, so passing it there fails
# at startup.
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown("Voxtral TTS is a text-to-speech model that can clone voice from any reference audio. Learn more about [Voxtral TTS](https://huggingface.co/mistralai/Voxtral-3B-TTS-2603).", elem_classes="markdown")
    with gr.Tabs():
        # Tab 1: synthesize with one of the bundled reference voices.
        with gr.TabItem("Predefined Voices"):
            gr.Markdown("# Predefined Voices TTS", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and select a predefined voice.", elem_classes="markdown")
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=["sample.mp3", "voice.wav", "4languages.mp3"],
                        value="4languages.mp3",
                    )
                    submit_btn_predefined = gr.Button("Generate Audio", elem_classes="gradio-button")
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(label="Generated audio", elem_classes="gradio-audio")

            submit_btn_predefined.click(
                fn=lambda text, choice: gradio_tts(text, choice, None),
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
            )
            # NOTE(review): cache_examples=True runs the TTS backend for each
            # example at startup, which needs a valid MISTRAL_API_KEY —
            # confirm this is intended for the deployment environment.
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=lambda text, choice: gradio_tts(text, choice, None),
                cache_examples=True,
            )
        # Tab 2: synthesize with a user-uploaded reference voice.
        with gr.TabItem("Voice Cloning"):
            gr.Markdown("# Voice Cloning TTS", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and upload your reference audio.", elem_classes="markdown")
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio",
                        type="filepath",
                        sources=["upload"],
                        elem_classes="gradio-audio"
                    )
                    submit_btn_cloning = gr.Button("Generate Audio", elem_classes="gradio-button")
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(label="Generated audio", elem_classes="gradio-audio")

            submit_btn_cloning.click(
                fn=lambda text, audio: gradio_tts(text, None, audio),
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
            )
            gr.Examples(
                examples=[
                    ["Frontier AI in your hands.", "sample.mp3"],
                    ["Hello, world!", "voice.wav"],
                ],
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=lambda text, audio: gradio_tts(text, None, audio),
                cache_examples=True,
            )

if __name__ == "__main__":
    demo.launch(share=False)
styles.css ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Web fonts: JetBrains Mono for code-like text, Inter for UI text. */
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Inter:wght@400;500;600;700&display=swap');

/* Light mode (default) — warm paper palette with an orange accent. */
:root {
    --bg-color: #FFFAEB;
    --bg-grid-color: #E9E2CB;
    --card-bg-color: #FFFAEB;
    --card-border-color: #E9E2CB;
    --text-color: #1E1E1E;
    --secondary-text-color: #444444;
    --accent-color: #FF8205;
    --box-shadow-color: rgba(0, 0, 0, 0.08);
    --grid-opacity: 0.05;
}

/* Dark mode (activated by adding `dark` class to body or .gradio-container) */
.dark {
    --bg-color: #121212;
    --bg-grid-color: rgba(255, 255, 255, 0.05);
    --card-bg-color: #1E1E1E;
    --card-border-color: #333333;
    --text-color: #FFFFFF;
    --secondary-text-color: #CCCCCC;
    --accent-color: #FF8205;
    --box-shadow-color: rgba(0, 0, 0, 0.3);
    --grid-opacity: 0.02;
}

/* Page background: solid color plus a subtle 40px square grid overlay. */
body, .gradio-container {
    background-color: var(--bg-color) !important;
    background-image:
        linear-gradient(var(--bg-grid-color) 1px, transparent 1px),
        linear-gradient(90deg, var(--bg-grid-color) 1px, transparent 1px) !important;
    background-size: 40px 40px !important;
    font-family: 'Inter', sans-serif !important;
    color: var(--text-color) !important;
    padding: 0 !important;
    margin: 0 !important;
}

/* Let the app span the full viewport width. */
.gradio-container {
    max-width: 100% !important;
    padding: 2rem !important;
}

/* Card styling for the transcription area (accent-colored top border). */
#transcribe_audio {
    background: linear-gradient(135deg, var(--card-bg-color) 0%, #FFF0C3 100%) !important;
    border: 2px solid var(--card-border-color) !important;
    border-top: 4px solid var(--accent-color) !important;
    padding: 2rem !important;
    box-shadow: 0 4px 24px var(--box-shadow-color) !important;
}

/* Page heading. */
h1 {
    font-size: 2rem !important;
    font-weight: 700 !important;
    color: var(--text-color) !important;
    margin: 0 0 0.5rem 0 !important;
    letter-spacing: -0.02em !important;
}

/* Markdown blocks rendered by Gradio (elem_classes="markdown"). */
.markdown {
    font-family: 'Inter', sans-serif !important;
    color: var(--secondary-text-color) !important;
}

/* Card container used for the input/output columns. */
.gradio-box {
    background: var(--card-bg-color) !important;
    border: 2px solid var(--card-border-color) !important;
    box-shadow: 0 8px 32px var(--box-shadow-color) !important;
    overflow: hidden !important;
    padding: 1rem !important;
}

/* Text input: monospace with a fine 20px grid background. */
.gradio-textbox {
    font-family: 'JetBrains Mono', monospace !important;
    font-size: 1.1rem !important;
    line-height: 1.8 !important;
    color: var(--text-color) !important;
    white-space: pre-wrap !important;
    word-break: break-word !important;
    text-align: left !important;
    min-height: 200px !important;
    background-color: var(--card-bg-color) !important;
    background-image:
        linear-gradient(rgba(0, 0, 0, var(--grid-opacity)) 1px, transparent 1px),
        linear-gradient(90deg, rgba(0, 0, 0, var(--grid-opacity)) 1px, transparent 1px) !important;
    background-size: 20px 20px !important;
    padding: 1rem !important;
    border: 1px solid var(--card-border-color) !important;
}

/* Primary action button: accent background, uppercase label. */
.gradio-button {
    background: var(--accent-color) !important;
    color: white !important;
    border: none !important;
    padding: 0.75rem 1.5rem !important;
    font-weight: 600 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.05em !important;
    font-size: 0.85rem !important;
    cursor: pointer !important;
    transition: all 0.2s !important;
}

/* Darker accent on hover for visual feedback. */
.gradio-button:hover {
    background: #E67200 !important;
}

/* Audio widgets (upload and playback). */
.gradio-audio {
    margin: 0rem !important;
    min-height: 250px !important;
}

/* Hide the default Gradio footer. */
footer {
    display: none !important;
}
tts.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ import pathlib
6
+ import shutil
7
+ import struct
8
+ import sys
9
+ import wave
10
+ import aiofiles
11
+ import aiohttp
12
+ from typing import Optional
13
+ import asyncio
14
+
15
async def synthesize_and_play_audio(
    input_text: str = "Hello!",
    reference_audio_path: str = "~/Downloads/jmsample.wav",
    model: str = "voxtral-mini-tts-260213",
    api_key: str = "MISTRAL_API_KEY",
    output_path: str = "/tmp/voxtral.wav",
    timeout: float = 180.0,
    raw_sample_rate: int = 24000,
    raw_channels: int = 1,
    no_play: bool = False,
    url: str = "https://api.mistral.ai/v1/audio/text-to-speech",
    reference_format: str = "raw-base64",
) -> int:
    """
    Asynchronously synthesize speech from input text using a reference audio file and play the output.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Path to the reference WAV file.
        model: Model name sent in the request.
        api_key: API key for authentication.
        output_path: Output audio file path.
        timeout: HTTP timeout in seconds.
        raw_sample_rate: Sample rate used when response audio is raw f32le (non-WAV).
        raw_channels: Channel count used when response audio is raw f32le (non-WAV).
        no_play: If True, only save audio, do not launch a player.
        url: TTS endpoint URL.
        reference_format: How to serialize reference_audio ("data-uri" or "raw-base64").

    Returns:
        int: 0 on success, non-zero on failure.
    """
    print(f"Synthesizing: {input_text!r}")

    try:
        # Use async file operations
        # expanduser() handles the "~/..." default; resolve() normalizes the path
        # for the error message below.
        reference_path = pathlib.Path(reference_audio_path).expanduser().resolve()
        if not reference_path.is_file():
            print(f"Reference audio not found: {reference_path}", file=sys.stderr)
            # 2 = local precondition failure, distinct from 1 (request/processing failure).
            return 2

        # Read reference audio asynchronously
        async with aiofiles.open(reference_path, 'rb') as f:
            reference_bytes = await f.read()

        # The reference clip is shipped in the JSON payload as base64, either
        # bare or wrapped in a data: URI depending on `reference_format`.
        reference_b64 = base64.b64encode(reference_bytes).decode("ascii")
        if reference_format == "data-uri":
            reference_audio = f"data:audio/wav;base64,{reference_b64}"
        else:
            reference_audio = reference_b64

        payload = {
            "model": model,
            "input": input_text,
            "reference_audio": reference_audio,
            "response_format": "wav",
        }
        # NOTE(review): auth is sent via `x-api-key`; confirm the endpoint does
        # not expect an `Authorization: Bearer` header instead.
        headers = {
            "content-type": "application/json",
            "x-api-key": api_key,
        }

        # Use async HTTP client
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(url, headers=headers, json=payload, timeout=timeout) as response:
                    if response.status >= 400:
                        error_text = await response.text()
                        print(f"Request failed: {response.status}", file=sys.stderr)
                        print(error_text, file=sys.stderr)
                        return 1

                    content_type = response.headers.get("content-type", "")
                    audio_bytes = await response.read()

                    # Some responses wrap the audio in JSON ({"audio": <base64>})
                    # rather than returning raw bytes; detect via content type.
                    if "application/json" in content_type.lower():
                        # aiohttp caches the body on the read() above, so json()
                        # parses the same bytes rather than re-reading the socket.
                        body = await response.json()
                        audio_field = body.get("audio") if isinstance(body, dict) else None
                        if not isinstance(audio_field, str) or not audio_field:
                            print("JSON response does not contain an 'audio' field.", file=sys.stderr)
                            print(body, file=sys.stderr)
                            return 1

                        # Strip a possible "data:...;base64," prefix before decoding.
                        if "," in audio_field and audio_field.startswith("data:"):
                            audio_field = audio_field.split(",", 1)[1]
                        try:
                            audio_bytes = base64.b64decode(audio_field, validate=False)
                        except Exception as exc:
                            print(f"Failed to decode JSON audio field as base64: {exc}", file=sys.stderr)
                            return 1

                    # If the bytes lack a RIFF/WAVE header, assume a raw f32le
                    # stream and wrap it in a WAV container; on conversion
                    # failure the raw bytes are still written out below.
                    if not _looks_like_wav(audio_bytes):
                        converted = _convert_f32le_to_wav(
                            audio_bytes,
                            sample_rate=raw_sample_rate,
                            channels=raw_channels,
                        )
                        if converted is not None:
                            audio_bytes = converted
                            print(
                                "Response was non-WAV raw audio; converted f32le stream to WAV.",
                                file=sys.stderr,
                            )
                        else:
                            print(
                                "Response bytes are non-WAV and could not be auto-converted.",
                                file=sys.stderr,
                            )

                    # Write output file asynchronously
                    output_path_obj = pathlib.Path(output_path).expanduser()
                    async with aiofiles.open(output_path_obj, 'wb') as f:
                        await f.write(audio_bytes)
                    print(f"Wrote {len(audio_bytes)} bytes to {output_path_obj}")

                    if not no_play:
                        await maybe_play_audio_async(output_path_obj)

                    return 0

            except asyncio.TimeoutError:
                print(f"Request timed out after {timeout} seconds", file=sys.stderr)
                return 1
            except aiohttp.ClientError as e:
                print(f"HTTP client error: {e}", file=sys.stderr)
                return 1

    except Exception as e:
        # Top-level boundary: report any unexpected failure as status 1
        # instead of propagating an exception to the caller.
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1
145
+
146
+ def _looks_like_wav(audio_bytes: bytes) -> bool:
147
+ return len(audio_bytes) >= 12 and audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE"
148
+
149
+ def _convert_f32le_to_wav(audio_bytes: bytes, sample_rate: int, channels: int) -> Optional[bytes]:
150
+ if len(audio_bytes) % 4 != 0:
151
+ return None
152
+ if channels <= 0 or sample_rate <= 0:
153
+ return None
154
+
155
+ pcm16 = bytearray()
156
+ for (sample,) in struct.iter_unpack("<f", audio_bytes):
157
+ clipped = max(-1.0, min(1.0, sample))
158
+ pcm16.extend(struct.pack("<h", int(clipped * 32767)))
159
+
160
+ buffer = io.BytesIO()
161
+ with wave.open(buffer, "wb") as wav_file:
162
+ wav_file.setnchannels(channels)
163
+ wav_file.setsampwidth(2)
164
+ wav_file.setframerate(sample_rate)
165
+ wav_file.writeframes(bytes(pcm16))
166
+ return buffer.getvalue()
167
+
168
async def maybe_play_audio_async(path: pathlib.Path) -> None:
    """Play the audio file at *path* with the first working CLI player.

    Tries afplay, then ffplay, then mpv; players not on PATH are skipped,
    and a failing player falls through to the next candidate.
    """
    candidates = [
        ("afplay", (str(path),)),
        ("ffplay", ("-nodisp", "-autoexit", str(path))),
        ("mpv", (str(path),)),
    ]

    for player, player_args in candidates:
        # Skip players that are not installed.
        if shutil.which(player) is None:
            continue
        try:
            process = await asyncio.create_subprocess_exec(
                player, *player_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            await process.wait()
            if process.returncode == 0:
                # Playback succeeded; nothing more to do.
                return
            print(
                f"Player '{player}' failed with exit code {process.returncode}, trying next player.",
                file=sys.stderr,
            )
        except Exception as e:
            print(f"Error with player '{player}': {e}", file=sys.stderr)
            continue

    print("No local audio player found (afplay/ffplay/mpv).", file=sys.stderr)