Spaces:
Running
Running
add auto-captioning (BPM/key/signature via librosa), add librosa+mutagen deps
Browse files- Dockerfile +1 -1
- train_engine.py +271 -2
Dockerfile
CHANGED
|
@@ -78,7 +78,7 @@ RUN pip3 install --no-cache-dir --extra-index-url https://download.pytorch.org/w
|
|
| 78 |
"gradio[mcp]==5.29.0" requests torch safetensors \
|
| 79 |
"transformers>=4.51.0,<4.58.0" peft>=0.18.0 \
|
| 80 |
loguru "torchaudio==2.4.0" "diffusers==0.30.3" lightning numpy tensorboard soundfile \
|
| 81 |
-
einops vector_quantize_pytorch
|
| 82 |
|
| 83 |
# Clone ACE-Step repo for training module
|
| 84 |
RUN git clone --depth 1 https://github.com/ace-step/ACE-Step-1.5 /app/ace-step-source
|
|
|
|
| 78 |
"gradio[mcp]==5.29.0" requests torch safetensors \
|
| 79 |
"transformers>=4.51.0,<4.58.0" peft>=0.18.0 \
|
| 80 |
loguru "torchaudio==2.4.0" "diffusers==0.30.3" lightning numpy tensorboard soundfile \
|
| 81 |
+
einops vector_quantize_pytorch librosa mutagen
|
| 82 |
|
| 83 |
# Clone ACE-Step repo for training module
|
| 84 |
RUN git clone --depth 1 https://github.com/ace-step/ACE-Step-1.5 /app/ace-step-source
|
train_engine.py
CHANGED
|
@@ -19,9 +19,11 @@ import logging
|
|
| 19 |
import math
|
| 20 |
import os
|
| 21 |
import random
|
|
|
|
| 22 |
import sys
|
| 23 |
import time
|
| 24 |
import types
|
|
|
|
| 25 |
from dataclasses import dataclass, field
|
| 26 |
from pathlib import Path
|
| 27 |
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
|
|
@@ -771,6 +773,259 @@ def _detect_max_duration(files: List[Path]) -> float:
|
|
| 771 |
return min(max_dur if max_dur > 0 else MAX_AUDIO_DURATION, MAX_AUDIO_DURATION)
|
| 772 |
|
| 773 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
# ============================================================================
|
| 775 |
# PREPROCESSING (2-pass sequential)
|
| 776 |
# ============================================================================
|
|
@@ -846,8 +1101,22 @@ def preprocess_audio(
|
|
| 846 |
lat_len = target_latents.shape[1]
|
| 847 |
att_mask = torch.ones(1, lat_len, device=device, dtype=dtype)
|
| 848 |
|
| 849 |
-
caption
|
| 850 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
text_prompt = caption
|
| 852 |
|
| 853 |
with torch.no_grad():
|
|
|
|
| 19 |
import math
|
| 20 |
import os
|
| 21 |
import random
|
| 22 |
+
import re
|
| 23 |
import sys
|
| 24 |
import time
|
| 25 |
import types
|
| 26 |
+
import unicodedata
|
| 27 |
from dataclasses import dataclass, field
|
| 28 |
from pathlib import Path
|
| 29 |
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
|
|
|
|
| 773 |
return min(max_dur if max_dur > 0 else MAX_AUDIO_DURATION, MAX_AUDIO_DURATION)
|
| 774 |
|
| 775 |
|
| 776 |
+
# ============================================================================
# AUDIO ANALYSIS (ported from Side-Step, faf mode only -- CPU, ~2-3s/file)
# ============================================================================

# Krumhansl key profile for single-profile key detection.
# Index 0 is the tonic, indices 1-11 the semitones above it; the key
# detector correlates rotated chroma vectors against these templates.
_KEY_PROFILE_MAJOR = [6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
                      2.52, 5.19, 2.39, 3.66, 2.29, 2.88]
_KEY_PROFILE_MINOR = [6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
                      2.54, 4.75, 3.98, 2.69, 3.34, 3.17]
# Pitch-class names in chroma-bin order (C = bin 0), sharp spellings only.
_PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F",
                  "F#", "G", "G#", "A", "A#", "B"]

# Filename pattern: "Artist - Title" (separator may be -, – or —,
# with optional surrounding whitespace; artist match is non-greedy).
_FILENAME_RE = re.compile(r"^(.+?)\s*[-–—]\s*(.+)$")
|
| 790 |
+
|
| 791 |
+
|
| 792 |
+
def _octave_correct_bpm(bpm: float, lo: float = 70.0, hi: float = 180.0) -> float:
|
| 793 |
+
"""Fold BPM into the musical sweet-spot range [lo, hi]."""
|
| 794 |
+
if bpm <= 0:
|
| 795 |
+
return bpm
|
| 796 |
+
candidate = bpm
|
| 797 |
+
while candidate > hi:
|
| 798 |
+
candidate /= 2.0
|
| 799 |
+
while candidate < lo:
|
| 800 |
+
candidate *= 2.0
|
| 801 |
+
if candidate < lo or candidate > hi:
|
| 802 |
+
return bpm
|
| 803 |
+
return candidate
|
| 804 |
+
|
| 805 |
+
|
| 806 |
+
def _detect_bpm_faf(y, sr) -> Optional[int]:
    """Estimate tempo via librosa's beat tracker with octave correction (faf mode).

    Args:
        y: Mono audio samples.
        sr: Sample rate in Hz.

    Returns:
        Rounded integer BPM folded into the musical range, or None when
        tracking fails or reports a non-positive tempo (best-effort).
    """
    import librosa
    import numpy as np

    try:
        raw_tempo, _beats = librosa.beat.beat_track(y=y, sr=sr)
        # Newer librosa returns tempo as a 0-d/1-element array; coerce to float.
        tempo_value = float(np.atleast_1d(raw_tempo)[0])
        if tempo_value <= 0:
            return None
        return int(round(_octave_correct_bpm(tempo_value)))
    except Exception:
        # Analysis is best-effort; report "unknown" rather than raise.
        return None
|
| 819 |
+
|
| 820 |
+
|
| 821 |
+
def _detect_key_faf(y, sr) -> Optional[str]:
    """Detect key using Krumhansl profile on chroma_cens (faf mode).

    Pipeline: harmonic separation -> CENS chroma -> RMS-energy-weighted
    average chroma -> correlation against all 24 rotations of the
    Krumhansl major/minor profiles; the best-correlated rotation wins.

    Args:
        y: Mono audio samples (1-D array).
        sr: Sample rate of ``y`` in Hz.

    Returns:
        Key name such as "C major" or "A minor", or None on failure.
    """
    import librosa
    import numpy as np

    try:
        # Suppress percussive content so chroma reflects pitched material.
        y_harmonic = librosa.effects.harmonic(y, margin=2.0)
        chroma = librosa.feature.chroma_cens(y=y_harmonic, sr=sr)

        # Energy-weighted average chroma: louder frames contribute more.
        # NOTE(review): assumes chroma_cens frames align with RMS frames at
        # hop_length=512 -- the clip to min_len below papers over any
        # off-by-a-few mismatch; confirm against librosa defaults.
        rms = librosa.feature.rms(y=y_harmonic, frame_length=2048, hop_length=512)
        rms_vec = rms[0]
        min_len = min(chroma.shape[1], len(rms_vec))
        chroma = chroma[:, :min_len]
        rms_vec = rms_vec[:min_len]
        weights = rms_vec / (rms_vec.sum() + 1e-10)  # epsilon guards 0/0 on silence
        chroma_avg = (chroma * weights[None, :]).sum(axis=1)
        s = chroma_avg.sum()
        if s == 0:
            # All-zero chroma (pure silence): no key to report.
            return None
        chroma_avg = chroma_avg / s

        # Normalize both reference profiles to sum to 1, matching chroma_avg.
        major_norm = np.array(_KEY_PROFILE_MAJOR)
        major_norm = major_norm / major_norm.sum()
        minor_norm = np.array(_KEY_PROFILE_MINOR)
        minor_norm = minor_norm / minor_norm.sum()

        # Try every candidate tonic: rolling by -shift aligns pitch class
        # `shift` with the profile's tonic slot (index 0).
        best_corr = -2.0  # below the minimum possible correlation (-1)
        best_key = "C major"
        for shift in range(12):
            rotated = np.roll(chroma_avg, -shift)
            corr_maj = float(np.corrcoef(rotated, major_norm)[0, 1])
            if corr_maj > best_corr:
                best_corr = corr_maj
                best_key = f"{_PITCH_CLASSES[shift]} major"
            corr_min = float(np.corrcoef(rotated, minor_norm)[0, 1])
            if corr_min > best_corr:
                best_corr = corr_min
                best_key = f"{_PITCH_CLASSES[shift]} minor"
        return best_key
    except Exception:
        # Best-effort: any librosa/numpy failure yields "unknown key".
        return None
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
def _detect_time_signature_faf() -> str:
|
| 866 |
+
"""Faf mode returns hardcoded 4/4 (correct ~80%+ of the time)."""
|
| 867 |
+
return "4/4"
|
| 868 |
+
|
| 869 |
+
|
| 870 |
+
def _sanitize_tag(value: str) -> str:
|
| 871 |
+
"""Normalize a tag value: NFKC normalize, strip invisible chars."""
|
| 872 |
+
value = unicodedata.normalize("NFKC", value)
|
| 873 |
+
value = (
|
| 874 |
+
value
|
| 875 |
+
.replace("", "").replace("", "")
|
| 876 |
+
.replace("", "").replace("", "")
|
| 877 |
+
.replace("", "").replace("", "")
|
| 878 |
+
.replace("", "").replace("", "")
|
| 879 |
+
.replace("", "")
|
| 880 |
+
)
|
| 881 |
+
value = "".join(
|
| 882 |
+
c for c in value
|
| 883 |
+
if c in ("\n", "\r", "\t", " ") or unicodedata.category(c)[0] != "C"
|
| 884 |
+
)
|
| 885 |
+
return value.strip()
|
| 886 |
+
|
| 887 |
+
|
| 888 |
+
def _extract_metadata_from_tags(audio_path: Path) -> Tuple[str, str]:
    """Extract (title, artist) from audio tags via mutagen, fallback to filename.

    Tag lookup order: ID3 frames (TIT2 / TPE1, TPE2), then Vorbis comments
    and MP4 atoms (title / \xa9nam, artist / \xa9ART / albumartist / aART).
    If no title tag is found, the filename stem is parsed as
    "Artist - Title"; failing that, the bare stem becomes the title.

    Args:
        audio_path: Path to the audio file.

    Returns:
        (title, artist); title falls back to the filename stem, artist to "".
    """
    title, artist = None, None
    try:
        # mutagen is optional at runtime: any import or parse failure simply
        # drops through to filename-based parsing below.
        import mutagen
        mf = mutagen.File(str(audio_path))
        if mf is not None and mf.tags is not None:
            # ID3 (MP3, AIFF)
            for key in ("TIT2",):
                val = mf.tags.get(key)
                if val:
                    title = _sanitize_tag(str(val))
                    break
            for key in ("TPE1", "TPE2"):
                val = mf.tags.get(key)
                if val:
                    artist = _sanitize_tag(str(val))
                    break
            # Vorbis (FLAC, OGG) and MP4 atoms
            if title is None:
                for key in ("title", "\xa9nam"):
                    vals = mf.tags.get(key)
                    if vals:
                        # Vorbis values are lists; MP4/other may be scalar.
                        raw = str(vals[0]) if isinstance(vals, list) else str(vals)
                        title = _sanitize_tag(raw)
                        break
            if artist is None:
                for key in ("artist", "\xa9ART", "albumartist", "aART"):
                    vals = mf.tags.get(key)
                    if vals:
                        raw = str(vals[0]) if isinstance(vals, list) else str(vals)
                        artist = _sanitize_tag(raw)
                        break
    except Exception:
        pass

    # Fallback to filename parsing
    if not title:
        stem = audio_path.stem
        match = _FILENAME_RE.match(stem)
        if match:
            # Keep a tag-derived artist if one was found; otherwise use the
            # filename's "Artist" half.
            artist = artist or match.group(1).strip()
            title = match.group(2).strip()
        else:
            title = stem.strip()

    return title or audio_path.stem, artist or ""
|
| 935 |
+
|
| 936 |
+
|
| 937 |
+
def analyze_and_caption(audio_path: str, mode: str = "faf") -> Dict[str, Any]:
    """Analyze an audio file and build a training caption.

    Uses faf mode only (CPU, ~2-3s per file): librosa beat_track for BPM,
    Krumhansl chroma for key, hardcoded 4/4 time signature. If the audio
    cannot be decoded, a tag/filename-only caption is produced instead.

    Args:
        audio_path: Path to the audio file.
        mode: Analysis mode (only "faf" supported).

    Returns:
        Dict with keys: caption, bpm, key, signature, lyrics, title, artist.
    """
    import librosa
    import numpy as np

    path = Path(audio_path)

    # Decode once; every detector reuses the same normalized waveform.
    try:
        samples, rate = librosa.load(str(path), sr=None, mono=True)
        # Trim silence, but only keep the trimmed signal when at least one
        # second of audio survives; then peak-normalize to [-1, 1].
        trimmed, _edges = librosa.effects.trim(samples, top_db=30)
        if len(trimmed) >= rate:
            samples = trimmed
        amplitude = np.max(np.abs(samples))
        if amplitude > 0:
            samples = samples / amplitude
    except Exception as exc:
        # Unreadable audio: fall back to tag/filename metadata only.
        logger.warning("Could not load audio for analysis: %s: %s", path.name, exc)
        title, artist = _extract_metadata_from_tags(path)
        fallback = f"A track by {artist}" if artist else f"A track titled {title}"
        return {
            "caption": fallback,
            "bpm": None, "key": None, "signature": "4/4",
            "lyrics": "[Instrumental]", "title": title, "artist": artist,
        }

    bpm = _detect_bpm_faf(samples, rate)
    key = _detect_key_faf(samples, rate)
    signature = _detect_time_signature_faf()
    title, artist = _extract_metadata_from_tags(path)

    # Assemble the ACE-Step caption, e.g.
    # "A track by X at 120 BPM in C major 4/4 time".
    pieces = ["A", f"track by {artist}" if artist else "track"]
    if bpm:
        pieces.append(f"at {bpm} BPM")
    if key:
        pieces.append(f"in {key}")
    pieces.append(f"{signature} time")
    caption = " ".join(pieces)

    result = {
        "caption": caption,
        "bpm": bpm,
        "key": key,
        "signature": signature,
        "lyrics": "[Instrumental]",
        "title": title,
        "artist": artist,
    }

    logger.info("Auto-caption for %s: %s", path.name, caption)
    return result
|
| 1006 |
+
|
| 1007 |
+
|
| 1008 |
+
def _write_caption_sidecar(audio_path: Path, analysis: Dict[str, Any]) -> Path:
    """Persist analysis results as a pretty-printed ``.json`` sidecar.

    The sidecar shares the audio file's stem with a ``.json`` suffix and is
    overwritten if it already exists.

    Args:
        audio_path: The audio file the sidecar accompanies.
        analysis: Caption/analysis dict to serialize.

    Returns:
        Path of the written sidecar file.
    """
    target = audio_path.with_suffix(".json")
    payload = json.dumps(analysis, indent=2, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
    logger.info("Wrote caption sidecar: %s", target)
    return target
|
| 1015 |
+
|
| 1016 |
+
|
| 1017 |
+
def _read_caption_sidecar(audio_path: Path) -> Optional[Dict[str, Any]]:
|
| 1018 |
+
"""Read an existing .json caption sidecar if it exists."""
|
| 1019 |
+
sidecar_path = audio_path.with_suffix(".json")
|
| 1020 |
+
if not sidecar_path.is_file():
|
| 1021 |
+
return None
|
| 1022 |
+
try:
|
| 1023 |
+
with open(sidecar_path, "r", encoding="utf-8") as f:
|
| 1024 |
+
return json.load(f)
|
| 1025 |
+
except Exception:
|
| 1026 |
+
return None
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
# ============================================================================
|
| 1030 |
# PREPROCESSING (2-pass sequential)
|
| 1031 |
# ============================================================================
|
|
|
|
| 1101 |
lat_len = target_latents.shape[1]
|
| 1102 |
att_mask = torch.ones(1, lat_len, device=device, dtype=dtype)
|
| 1103 |
|
| 1104 |
+
# Auto-caption: read existing sidecar or analyze
|
| 1105 |
+
sidecar = _read_caption_sidecar(af)
|
| 1106 |
+
if sidecar and sidecar.get("caption"):
|
| 1107 |
+
caption = sidecar["caption"]
|
| 1108 |
+
lyrics = sidecar.get("lyrics", "[Instrumental]")
|
| 1109 |
+
logger.info("Using existing caption for %s", af.name)
|
| 1110 |
+
else:
|
| 1111 |
+
try:
|
| 1112 |
+
analysis = analyze_and_caption(str(af))
|
| 1113 |
+
caption = analysis["caption"]
|
| 1114 |
+
lyrics = analysis.get("lyrics", "[Instrumental]")
|
| 1115 |
+
_write_caption_sidecar(af, analysis)
|
| 1116 |
+
except Exception as exc:
|
| 1117 |
+
logger.warning("Auto-caption failed for %s: %s, using filename", af.name, exc)
|
| 1118 |
+
caption = af.stem
|
| 1119 |
+
lyrics = "[Instrumental]"
|
| 1120 |
text_prompt = caption
|
| 1121 |
|
| 1122 |
with torch.no_grad():
|