Spaces:
Running
Running
add mid/sas analysis modes (Demucs + ensemble), auto-select by dataset size
Browse files- train_engine.py +989 -65
train_engine.py
CHANGED
|
@@ -20,7 +20,9 @@ import math
|
|
| 20 |
import os
|
| 21 |
import random
|
| 22 |
import re
|
|
|
|
| 23 |
import sys
|
|
|
|
| 24 |
import time
|
| 25 |
import types
|
| 26 |
import unicodedata
|
|
@@ -774,14 +776,43 @@ def _detect_max_duration(files: List[Path]) -> float:
|
|
| 774 |
|
| 775 |
|
| 776 |
# ============================================================================
|
| 777 |
-
# AUDIO ANALYSIS (ported from Side-Step
|
| 778 |
# ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
|
| 780 |
-
# Krumhansl key profile for single-profile key detection
|
| 781 |
-
_KEY_PROFILE_MAJOR = [6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
|
| 782 |
-
2.52, 5.19, 2.39, 3.66, 2.29, 2.88]
|
| 783 |
-
_KEY_PROFILE_MINOR = [6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
|
| 784 |
-
2.54, 4.75, 3.98, 2.69, 3.34, 3.17]
|
| 785 |
_PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F",
|
| 786 |
"F#", "G", "G#", "A", "A#", "B"]
|
| 787 |
|
|
@@ -789,6 +820,141 @@ _PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F",
|
|
| 789 |
_FILENAME_RE = re.compile(r"^(.+?)\s*[-–—]\s*(.+)$")
|
| 790 |
|
| 791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
def _octave_correct_bpm(bpm: float, lo: float = 70.0, hi: float = 180.0) -> float:
|
| 793 |
"""Fold BPM into the musical sweet-spot range [lo, hi]."""
|
| 794 |
if bpm <= 0:
|
|
@@ -803,68 +969,711 @@ def _octave_correct_bpm(bpm: float, lo: float = 70.0, hi: float = 180.0) -> floa
|
|
| 803 |
return candidate
|
| 804 |
|
| 805 |
|
| 806 |
-
def
|
| 807 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 808 |
import librosa
|
| 809 |
import numpy as np
|
| 810 |
|
|
|
|
|
|
|
|
|
|
| 811 |
try:
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
if
|
| 815 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
except Exception:
|
| 817 |
pass
|
| 818 |
-
return None
|
| 819 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
|
| 821 |
-
|
| 822 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
import librosa
|
| 824 |
import numpy as np
|
| 825 |
|
| 826 |
try:
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 862 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 864 |
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
|
| 869 |
|
| 870 |
def _sanitize_tag(value: str) -> str:
|
|
@@ -934,24 +1743,39 @@ def _extract_metadata_from_tags(audio_path: Path) -> tuple:
|
|
| 934 |
return title or audio_path.stem, artist or ""
|
| 935 |
|
| 936 |
|
| 937 |
-
def analyze_and_caption(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
"""Analyze an audio file and build a training caption.
|
| 939 |
|
| 940 |
-
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
|
| 943 |
Args:
|
| 944 |
audio_path: Path to the audio file.
|
| 945 |
-
mode: Analysis mode (
|
|
|
|
| 946 |
|
| 947 |
Returns:
|
| 948 |
-
Dict with keys: caption, bpm, key, signature, lyrics, title, artist
|
|
|
|
| 949 |
"""
|
| 950 |
import librosa
|
| 951 |
import numpy as np
|
| 952 |
|
| 953 |
audio_path = Path(audio_path)
|
| 954 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
# Load audio once, reuse for all detectors
|
| 956 |
try:
|
| 957 |
y, sr = librosa.load(str(audio_path), sr=None, mono=True)
|
|
@@ -969,11 +1793,76 @@ def analyze_and_caption(audio_path: str, mode: str = "faf") -> Dict[str, Any]:
|
|
| 969 |
"caption": f"A track by {artist}" if artist else f"A track titled {title}",
|
| 970 |
"bpm": None, "key": None, "signature": "4/4",
|
| 971 |
"lyrics": "[Instrumental]", "title": title, "artist": artist,
|
|
|
|
| 972 |
}
|
| 973 |
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
title, artist = _extract_metadata_from_tags(audio_path)
|
| 978 |
|
| 979 |
# Build caption string for ACE-Step training
|
|
@@ -999,9 +1888,10 @@ def analyze_and_caption(audio_path: str, mode: str = "faf") -> Dict[str, Any]:
|
|
| 999 |
"lyrics": lyrics,
|
| 1000 |
"title": title,
|
| 1001 |
"artist": artist,
|
|
|
|
| 1002 |
}
|
| 1003 |
|
| 1004 |
-
logger.info("Auto-caption for %s: %s", audio_path.name, caption)
|
| 1005 |
return result
|
| 1006 |
|
| 1007 |
|
|
@@ -1106,15 +1996,49 @@ def preprocess_audio(
|
|
| 1106 |
if sidecar and sidecar.get("caption"):
|
| 1107 |
caption = sidecar["caption"]
|
| 1108 |
lyrics = sidecar.get("lyrics", "[Instrumental]")
|
| 1109 |
-
logger.info("
|
| 1110 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
try:
|
| 1112 |
-
|
|
|
|
|
|
|
|
|
|
| 1113 |
caption = analysis["caption"]
|
| 1114 |
lyrics = analysis.get("lyrics", "[Instrumental]")
|
| 1115 |
_write_caption_sidecar(af, analysis)
|
|
|
|
| 1116 |
except Exception as exc:
|
| 1117 |
-
logger.warning("
|
| 1118 |
caption = af.stem
|
| 1119 |
lyrics = "[Instrumental]"
|
| 1120 |
text_prompt = caption
|
|
|
|
| 20 |
import os
|
| 21 |
import random
|
| 22 |
import re
|
| 23 |
+
import shutil
|
| 24 |
import sys
|
| 25 |
+
import tempfile
|
| 26 |
import time
|
| 27 |
import types
|
| 28 |
import unicodedata
|
|
|
|
| 776 |
|
| 777 |
|
| 778 |
# ============================================================================
# AUDIO ANALYSIS (ported from Side-Step -- faf / mid / sas modes)
# ============================================================================
#
# faf ("Fast As F*ck") ~2-3 s/file - single-method, no Demucs
# mid ~12 s/file - 3-method ensemble, Demucs stems
# sas ("Smart/Slow As Sh*t") ~30 s/file - deep multi-technique + chunked
#
# Demucs on CPU is SLOW (~2-5 min/file). mid/sas are designed for GPU
# stem separation but will still work on CPU -- just much slower.
# ============================================================================

# Recognized analysis-quality modes, cheapest to most thorough.
_ANALYSIS_MODES = ("faf", "mid", "sas")
# Number of windows sampled per track in "sas" mode.
_SAS_NUM_CHUNKS = 5
_SAS_CHUNK_SECONDS = 15  # seconds per analysis window

# Key profile families for multi-profile voting (mid / sas).
# Each family maps "major"/"minor" -> 12 pitch-class weights starting at C.
_KEY_PROFILES = {
    "krumhansl": {
        "major": [6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
                  2.52, 5.19, 2.39, 3.66, 2.29, 2.88],
        "minor": [6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
                  2.54, 4.75, 3.98, 2.69, 3.34, 3.17],
    },
    "temperley": {
        "major": [5.0, 2.0, 3.5, 2.0, 4.5, 4.0,
                  2.0, 4.5, 2.0, 3.5, 1.5, 4.0],
        "minor": [5.0, 2.0, 3.5, 4.5, 2.0, 3.5,
                  2.0, 4.5, 3.5, 2.0, 1.5, 4.0],
    },
    "albrecht": {
        "major": [0.238, 0.006, 0.111, 0.006, 0.137, 0.094,
                  0.016, 0.214, 0.009, 0.080, 0.008, 0.081],
        "minor": [0.220, 0.006, 0.104, 0.123, 0.019, 0.103,
                  0.012, 0.214, 0.062, 0.022, 0.061, 0.052],
    },
}

# Pitch-class names indexed 0-11 (C..B), used to render detected keys.
_PITCH_CLASSES = ["C", "C#", "D", "D#", "E", "F",
                  "F#", "G", "G#", "A", "A#", "B"]
|
| 818 |
|
|
|
|
| 820 |
_FILENAME_RE = re.compile(r"^(.+?)\s*[-–—]\s*(.+)$")
|
| 821 |
|
| 822 |
|
| 823 |
+
# ---- Demucs stem separation (mid / sas) --------------------------------

def separate_stems(
    audio_path: Path,
    tmp_dir: Path,
    device: str = "cpu",
) -> Tuple[Path, Path]:
    """Run Demucs HTDemucs and return (drums_path, harmonics_path).

    Harmonics = bass + other stems summed. Vocals are discarded.

    WARNING: On CPU this takes ~2-5 minutes per file.

    Args:
        audio_path: Source audio file to separate.
        tmp_dir: Directory that receives ``drums.wav`` and ``harmonics.wav``.
        device: Torch device string (e.g. "cpu" or "cuda").

    Returns:
        Tuple of (drums_path, harmonics_path) inside ``tmp_dir``.
    """
    import torchaudio
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    torch_device = torch.device(device)

    logger.info("Loading Demucs HTDemucs model on %s", device)
    if device == "cpu":
        logger.warning(
            "Demucs on CPU is slow (~2-5 min per file). "
            "Consider using 'faf' mode or running on a GPU machine."
        )
    model = get_model("htdemucs")
    model.to(torch_device)
    model.eval()

    wav, sr = torchaudio.load(str(audio_path))

    # Resample to model's expected rate (44100 Hz) if needed
    if sr != model.samplerate:
        wav = torchaudio.functional.resample(wav, sr, model.samplerate)
        sr = model.samplerate

    # HTDemucs requires stereo input
    if wav.shape[0] == 1:
        wav = wav.repeat(2, 1)

    # Add a batch dimension and move to the inference device.
    wav = wav.unsqueeze(0).to(torch_device)

    logger.info("Separating stems for %s", audio_path.name)
    with torch.no_grad():
        sources = apply_model(model, wav, device=torch_device)

    # Map stem names to output indices; keep drums, merge bass + other.
    source_map = {name: i for i, name in enumerate(model.sources)}
    drums = sources[0, source_map["drums"]].cpu()
    bass = sources[0, source_map["bass"]].cpu()
    other = sources[0, source_map["other"]].cpu()
    harmonics = bass + other

    drums_path = tmp_dir / "drums.wav"
    harmonics_path = tmp_dir / "harmonics.wav"
    torchaudio.save(str(drums_path), drums, sr)
    torchaudio.save(str(harmonics_path), harmonics, sr)

    # Release the (large) model and tensors before returning.
    del model, sources, wav, drums, bass, other, harmonics
    gc.collect()

    logger.info("Stems written: %s, %s", drums_path, harmonics_path)
    return drums_path, harmonics_path
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
# ---- Chunk selection (sas mode) ----------------------------------------

def _select_chunks(
    y,  # np.ndarray
    sr: int,
    n_chunks: int = _SAS_NUM_CHUNKS,
    chunk_sec: float = _SAS_CHUNK_SECONDS,
    min_gap_sec: float = 10.0,
    use_onset: bool = True,
) -> list:
    """Select the most informative audio chunks for sas analysis.

    Half-overlapping windows are scored by onset density (or RMS energy
    when ``use_onset`` is False), everything below the median score is
    discarded, and the survivors are picked greedily best-first while
    keeping picks at least ``min_gap_sec`` apart.
    """
    import librosa
    import numpy as np

    win = int(chunk_sec * sr)
    step = win // 2
    if len(y) < win:
        return [y]

    scored = []
    for offset in range(0, len(y) - win + 1, step):
        segment = y[offset : offset + win]
        if use_onset:
            strength = librosa.onset.onset_strength(y=segment, sr=sr)
            scored.append((offset, float(np.mean(strength))))
        else:
            scored.append((offset, float(np.sqrt(np.mean(segment ** 2)))))

    if not scored:
        return [y]

    # Energy gate: keep only windows scoring at or above the median.
    cutoff = float(np.median(np.array([s for _, s in scored])))
    survivors = [pair for pair in scored if pair[1] >= cutoff] or scored
    survivors.sort(key=lambda pair: pair[1], reverse=True)

    gap = int(min_gap_sec * sr)
    picks = []
    for offset, _ in survivors:
        # Centre-to-centre distance equals start-to-start distance here,
        # because every window has the same length.
        if all(abs(offset - p) >= gap for p in picks):
            picks.append(offset)
            if len(picks) >= n_chunks:
                break

    if len(picks) < n_chunks:
        # Not enough spread-apart windows: relax the spacing constraint.
        for offset, _ in survivors:
            if offset not in picks:
                picks.append(offset)
                if len(picks) >= n_chunks:
                    break

    picks.sort()
    return [y[p : p + win] for p in picks]
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
# ---- BPM helpers --------------------------------------------------------
|
| 957 |
+
|
| 958 |
def _octave_correct_bpm(bpm: float, lo: float = 70.0, hi: float = 180.0) -> float:
|
| 959 |
"""Fold BPM into the musical sweet-spot range [lo, hi]."""
|
| 960 |
if bpm <= 0:
|
|
|
|
| 969 |
return candidate
|
| 970 |
|
| 971 |
|
| 972 |
+
def _bpm_core_ensemble(y, sr) -> list:
    """Run the 3-method BPM ensemble on a single audio buffer (mid/sas).

    Each method is attempted independently; failures are skipped so the
    ensemble degrades gracefully.

    Returns a list of octave-corrected BPM estimates.
    """
    import librosa
    import numpy as np

    estimates = []

    # Method A: beat_track
    try:
        tempo_a, _ = librosa.beat.beat_track(y=y, sr=sr)
        # beat_track may return a scalar or a 0-d/1-d array depending on
        # librosa version; atleast_1d normalizes both cases.
        val_a = float(np.atleast_1d(tempo_a)[0])
        if val_a > 0:
            estimates.append(_octave_correct_bpm(val_a))
    except Exception:
        pass

    # Method B: tempogram peak
    try:
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr)
        avg_tempogram = np.mean(tempogram, axis=1)
        bpm_axis = librosa.tempo_frequencies(tempogram.shape[0], sr=sr)
        # Only accept peaks in the plausible 30-300 BPM range.
        valid = (bpm_axis >= 30) & (bpm_axis <= 300)
        if np.any(valid):
            masked = avg_tempogram.copy()
            masked[~valid] = 0
            peak_idx = np.argmax(masked)
            val_b = float(bpm_axis[peak_idx])
            if val_b > 0:
                estimates.append(_octave_correct_bpm(val_b))
    except Exception:
        pass

    # Method C: onset autocorrelation
    try:
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        ac = librosa.autocorrelate(onset_env, max_size=len(onset_env))
        hop = 512  # matches onset_strength's default hop_length
        # Lag bounds corresponding to 300 BPM (shortest) and 30 BPM (longest).
        min_lag = int(60.0 * sr / (300.0 * hop))
        max_lag = int(60.0 * sr / (30.0 * hop))
        max_lag = min(max_lag, len(ac) - 1)
        if min_lag < max_lag and max_lag > 0:
            segment = ac[min_lag:max_lag + 1]
            peak_offset = np.argmax(segment)
            peak_lag = min_lag + peak_offset
            if peak_lag > 0:
                val_c = 60.0 * sr / (peak_lag * hop)
                if val_c > 0:
                    estimates.append(_octave_correct_bpm(val_c))
    except Exception:
        pass

    return estimates
|
| 1028 |
+
|
| 1029 |
+
|
| 1030 |
+
def _bpm_consensus(estimates: list) -> Tuple[Optional[int], str]:
|
| 1031 |
+
"""Find consensus BPM from a list of estimates + assign confidence."""
|
| 1032 |
+
import numpy as np
|
| 1033 |
+
|
| 1034 |
+
if not estimates:
|
| 1035 |
+
return None, "low"
|
| 1036 |
+
|
| 1037 |
+
estimates_arr = np.array(estimates)
|
| 1038 |
+
best_cluster = []
|
| 1039 |
+
for ref in estimates_arr:
|
| 1040 |
+
cluster = [e for e in estimates_arr
|
| 1041 |
+
if abs(e - ref) / max(ref, 1) < 0.08]
|
| 1042 |
+
if len(cluster) > len(best_cluster):
|
| 1043 |
+
best_cluster = cluster
|
| 1044 |
+
|
| 1045 |
+
consensus = float(np.median(best_cluster)) if best_cluster else estimates[0]
|
| 1046 |
+
bpm = int(round(consensus))
|
| 1047 |
+
if bpm <= 0:
|
| 1048 |
+
return None, "low"
|
| 1049 |
+
|
| 1050 |
+
n_agree = len(best_cluster)
|
| 1051 |
+
n_total = len(estimates)
|
| 1052 |
+
if n_total >= 6:
|
| 1053 |
+
# sas thresholds (many data points)
|
| 1054 |
+
if n_agree / n_total >= 0.7:
|
| 1055 |
+
confidence = "high"
|
| 1056 |
+
elif n_agree / n_total >= 0.4:
|
| 1057 |
+
confidence = "medium"
|
| 1058 |
+
else:
|
| 1059 |
+
confidence = "low"
|
| 1060 |
+
else:
|
| 1061 |
+
# mid thresholds
|
| 1062 |
+
if n_agree >= 3:
|
| 1063 |
+
confidence = "high"
|
| 1064 |
+
elif n_agree >= 2:
|
| 1065 |
+
confidence = "medium"
|
| 1066 |
+
else:
|
| 1067 |
+
confidence = "low"
|
| 1068 |
|
| 1069 |
+
return bpm, confidence
|
| 1070 |
+
|
| 1071 |
+
|
| 1072 |
+
# ---- Unified BPM detection ---------------------------------------------

def _detect_bpm(y, sr, mode: str = "faf") -> Tuple[Optional[int], str]:
    """Detect BPM with quality controlled by mode.

    faf: Single beat_track + octave correction.
    mid: 3-method ensemble (beat_track + tempogram + onset-AC).
    sas: mid ensemble + PLP + multi-hop + chunked analysis.

    Returns (bpm, confidence).
    """
    import librosa
    import numpy as np

    try:
        # faf: single method
        if mode == "faf":
            try:
                tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
                val = float(np.atleast_1d(tempo)[0])
                if val > 0:
                    bpm = int(round(_octave_correct_bpm(val)))
                    logger.info("BPM faf: %d (raw: %.1f)", bpm, val)
                    # faf is a single estimator, so confidence is always low.
                    return bpm, "low"
            except Exception:
                pass
            return None, "low"

        # mid: 3-method ensemble
        estimates = _bpm_core_ensemble(y, sr)

        # sas: additional techniques
        # ibi_cv = coefficient of variation of inter-beat intervals;
        # 0.5 is the pessimistic default when it can't be measured.
        ibi_cv = 0.5
        if mode == "sas":
            # PLP (Predominant Local Pulse)
            try:
                onset_env = librosa.onset.onset_strength(y=y, sr=sr)
                pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr)
                plp_ac = librosa.autocorrelate(pulse, max_size=len(pulse))
                hop = 512  # matches onset_strength's default hop_length
                # Lag bounds for the plausible 30-300 BPM range.
                min_lag = int(60.0 * sr / (300.0 * hop))
                max_lag = int(60.0 * sr / (30.0 * hop))
                max_lag = min(max_lag, len(plp_ac) - 1)
                if min_lag < max_lag and max_lag > 0:
                    seg = plp_ac[min_lag:max_lag + 1]
                    peak_lag = min_lag + np.argmax(seg)
                    if peak_lag > 0:
                        plp_bpm = 60.0 * sr / (peak_lag * hop)
                        if plp_bpm > 0:
                            estimates.append(_octave_correct_bpm(plp_bpm))
            except Exception:
                pass

            # Multi-hop beat_track (256, 1024)
            for extra_hop in (256, 1024):
                try:
                    tempo_h, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=extra_hop)
                    val_h = float(np.atleast_1d(tempo_h)[0])
                    if val_h > 0:
                        estimates.append(_octave_correct_bpm(val_h))
                except Exception:
                    pass

            # Chunked ensemble: re-run the core ensemble on the most
            # informative windows of the track and pool all estimates.
            chunks = _select_chunks(y, sr, n_chunks=_SAS_NUM_CHUNKS, use_onset=True)
            for chunk in chunks:
                chunk_estimates = _bpm_core_ensemble(chunk, sr)
                estimates.extend(chunk_estimates)

            # IBI stability: steady inter-beat intervals (low CV) indicate
            # a reliable tempo track.
            try:
                _, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
                if beat_frames is not None and len(beat_frames) > 4:
                    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
                    ibis = np.diff(beat_times)
                    ibi_cv = float(np.std(ibis) / (np.mean(ibis) + 1e-10))
                else:
                    ibi_cv = 0.5
            except Exception:
                ibi_cv = 0.5

        bpm, confidence = _bpm_consensus(estimates)

        # sas: IBI stability can upgrade medium->high or downgrade
        if mode == "sas" and bpm is not None:
            if ibi_cv < 0.10 and confidence == "medium":
                confidence = "high"
            elif ibi_cv > 0.30 and confidence == "high":
                confidence = "medium"

        logger.info(
            "BPM [%s]: %s (estimates=%s, conf=%s)",
            mode, bpm,
            [round(e, 1) for e in estimates[:10]],
            confidence,
        )
        return bpm, confidence

    except Exception as exc:
        logger.warning("BPM detection failed: %s", exc)
        return None, "low"
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
# ---- Key detection helpers ----------------------------------------------

def _best_key_for_profile(chroma_avg, major_profile, minor_profile):
    """Find the best key match for a single profile family.

    Correlates the chroma vector against all 12 rotations of the
    (normalized) major and minor profiles and keeps the strongest match.

    Returns (key_label, correlation).
    """
    import numpy as np

    # Normalize each profile to unit sum so families are comparable.
    qualities = []
    for quality, raw in (("major", major_profile), ("minor", minor_profile)):
        profile = np.array(raw, dtype=float)
        qualities.append((quality, profile / profile.sum()))

    top_key = "C major"
    top_corr = -2.0  # below any possible Pearson correlation

    for tonic in range(12):
        # Rotating the chroma by -tonic aligns pitch class `tonic` with C.
        shifted = np.roll(chroma_avg, -tonic)
        for quality, profile in qualities:
            corr = float(np.corrcoef(shifted, profile)[0, 1])
            if corr > top_corr:
                top_corr = corr
                top_key = f"{_PITCH_CLASSES[tonic]} {quality}"

    return top_key, top_corr
|
| 1204 |
+
|
| 1205 |
+
|
| 1206 |
+
def _key_votes_from_chroma(chroma_avg, profiles=None) -> list:
    """Vote on key from a single chroma vector using specified profiles.

    Falls back to the module-level ``_KEY_PROFILES`` when no profile
    mapping is supplied.

    Returns list of (key_label, correlation) -- one per profile family.
    """
    families = _KEY_PROFILES if profiles is None else profiles
    return [
        _best_key_for_profile(chroma_avg, family["major"], family["minor"])
        for family in families.values()
    ]
|
| 1220 |
+
|
| 1221 |
+
|
| 1222 |
+
def _energy_weighted_chroma(chroma, y_harmonic):
    """Compute an energy-weighted average chroma vector.

    Frames are weighted by their RMS energy so loud passages dominate
    the key estimate.

    Returns normalized chroma_avg or None if zero energy.
    """
    import librosa
    import numpy as np

    frame_energy = librosa.feature.rms(
        y=y_harmonic, frame_length=2048, hop_length=512
    )[0]

    # Trim both series to the common frame count (frame grids can differ).
    n = min(chroma.shape[1], len(frame_energy))
    trimmed = chroma[:, :n]
    frame_energy = frame_energy[:n]

    # Epsilon guards against division by zero on silent input.
    weights = frame_energy / (frame_energy.sum() + 1e-10)
    averaged = (trimmed * weights[None, :]).sum(axis=1)

    total = averaged.sum()
    return None if total == 0 else averaged / total
|
| 1243 |
+
|
| 1244 |
+
|
| 1245 |
+
# ---- Unified key detection ----------------------------------------------
|
| 1246 |
+
|
| 1247 |
+
def _detect_key(y, sr, mode: str = "faf") -> Tuple[Optional[str], str]:
    """Detect musical key with quality controlled by mode.

    faf: Single Krumhansl profile on chroma_cens.
    mid: 3-profile x energy-weighted chroma_cens x 8s segment voting.
    sas: mid + multi-chroma fusion + tonnetz + tuning correction +
         ending resolution + chunked voting.

    Args:
        y: Mono audio buffer.
        sr: Sample rate of ``y``.
        mode: Analysis quality mode ("faf", "mid", or "sas").

    Returns (key, confidence) where key is e.g. "C major" (or None when
    no vote could be cast) and confidence is "high" / "medium" / "low".
    Never raises: any failure is logged and reported as (None, "low").
    """
    import librosa
    import numpy as np
    from collections import Counter

    try:
        # Harmonic enhancement: suppress percussive content before any
        # chroma analysis.  A larger HPSS margin (sas/mid) separates
        # more aggressively at extra cost.
        margin = 4.0 if mode != "faf" else 2.0
        y_harmonic = librosa.effects.harmonic(y, margin=margin)

        # sas: tuning correction -- estimate deviation (in fractional
        # bins) from A440 so off-pitch recordings map to the right bins.
        tuning = 0.0
        if mode == "sas":
            try:
                tuning = float(librosa.estimate_tuning(y=y_harmonic, sr=sr))
            except Exception:
                tuning = 0.0

        # faf: single chroma, single profile -- fast path, no voting.
        if mode == "faf":
            chroma = librosa.feature.chroma_cens(y=y_harmonic, sr=sr)
            chroma_avg = _energy_weighted_chroma(chroma, y_harmonic)
            if chroma_avg is None:
                return None, "low"
            kr = _KEY_PROFILES["krumhansl"]
            key_label, corr = _best_key_for_profile(
                chroma_avg, kr["major"], kr["minor"],
            )
            logger.info("Key faf: %s (corr=%.3f)", key_label, corr)
            # faf never claims more than "low" confidence by design.
            return key_label, "low"

        # mid / sas: multi-profile voting.  Votes accumulate in
        # parallel lists; all_weights[i] is the weight of all_votes[i].
        all_votes = []
        all_weights = []

        # sas fuses three chroma variants; mid uses CENS only.
        if mode == "sas":
            chroma_types = {
                "cens": lambda: librosa.feature.chroma_cens(
                    y=y_harmonic, sr=sr, tuning=tuning,
                ),
                "cqt": lambda: librosa.feature.chroma_cqt(
                    y=y_harmonic, sr=sr, tuning=tuning,
                ),
                "stft": lambda: librosa.feature.chroma_stft(
                    y=y_harmonic, sr=sr, tuning=tuning,
                ),
            }
        else:
            chroma_types = {
                "cens": lambda: librosa.feature.chroma_cens(
                    y=y_harmonic, sr=sr,
                ),
            }

        for chroma_name, chroma_fn in chroma_types.items():
            # Any single chroma variant failing just drops out of the
            # ensemble rather than aborting detection.
            try:
                chroma = chroma_fn()
            except Exception:
                continue

            chroma_avg = _energy_weighted_chroma(chroma, y_harmonic)
            if chroma_avg is None:
                continue

            # Global multi-profile vote (one vote per profile family).
            for key_label, corr in _key_votes_from_chroma(chroma_avg):
                all_votes.append(key_label)
                all_weights.append(1.0)

            # Segment-based voting: split this chroma into ~8 s windows
            # and let each energetic window cast its own votes, so a
            # long stable section outvotes a brief modulation.
            rms = librosa.feature.rms(
                y=y_harmonic, frame_length=2048, hop_length=512,
            )
            rms_vec = rms[0]
            min_len = min(chroma.shape[1], len(rms_vec))
            chroma_s = chroma[:, :min_len]
            rms_s = rms_vec[:min_len]

            # 8 s expressed in chroma frames at hop 512.
            seg_frames = int(8.0 * sr / 512)
            n_segments = max(1, chroma_s.shape[1] // seg_frames)

            for seg_i in range(n_segments):
                start = seg_i * seg_frames
                end = min(start + seg_frames, chroma_s.shape[1])
                seg_chroma = chroma_s[:, start:end]
                seg_w = rms_s[start:end]

                w_sum = seg_w.sum()
                # Skip silent segments -- they carry no key evidence.
                if w_sum < 1e-10:
                    continue

                seg_w_norm = seg_w / w_sum
                seg_avg = (seg_chroma * seg_w_norm[None, :]).sum(axis=1)
                s = seg_avg.sum()
                if s < 1e-10:
                    continue
                seg_avg = seg_avg / s

                for key_label, _ in _key_votes_from_chroma(seg_avg):
                    all_votes.append(key_label)
                    all_weights.append(1.0)

        # sas-only extras
        if mode == "sas":
            # Tonnetz -- weighted vote for major/minor disambiguation.
            # If tonal-centroid energy agrees with the current leader's
            # mode, reinforce it; otherwise cast one vote for the best
            # key of the opposite mode.
            try:
                tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr)
                tonnetz_avg = np.mean(tonnetz, axis=1)
                # NOTE(review): dims 4:6 treated as "major" energy and
                # 2:4 as "minor" -- heuristic mapping, not a librosa
                # guarantee; confirm against the tonnetz axis layout.
                major_energy = float(np.sum(tonnetz_avg[4:6] ** 2))
                minor_energy = float(np.sum(tonnetz_avg[2:4] ** 2))
                tonnetz_ratio = major_energy / (minor_energy + 1e-10)

                if all_votes:
                    temp_counts = Counter(all_votes)
                    leader = temp_counts.most_common(1)[0][0]
                    leader_is_major = "major" in leader
                    tonnetz_says_major = tonnetz_ratio > 1.0

                    if leader_is_major == tonnetz_says_major:
                        # Agreement: 3 extra votes at weight 1.5 each.
                        all_votes.extend([leader] * 3)
                        all_weights.extend([1.5] * 3)
                    else:
                        # Disagreement: score every profile's best key
                        # of the opposite mode and add it as one vote.
                        alt_mode = "minor" if leader_is_major else "major"
                        chroma_cens = librosa.feature.chroma_cens(
                            y=y_harmonic, sr=sr, tuning=tuning,
                        )
                        ca = _energy_weighted_chroma(chroma_cens, y_harmonic)
                        if ca is not None:
                            for name, pf in _KEY_PROFILES.items():
                                prof = np.array(pf[alt_mode], dtype=float)
                                prof_norm = prof / prof.sum()
                                best_corr = -2.0
                                best_k = ""
                                # Rotate chroma through all 12 tonics.
                                for shift in range(12):
                                    rotated = np.roll(ca, -shift)
                                    c = float(np.corrcoef(rotated, prof_norm)[0, 1])
                                    if c > best_corr:
                                        best_corr = c
                                        best_k = f"{_PITCH_CLASSES[shift]} {alt_mode}"
                                if best_k:
                                    all_votes.append(best_k)
                                    all_weights.append(1.0)
            except Exception:
                pass

            # Ending resolution -- last ~5 s weighted extra (songs tend
            # to resolve to the tonic at the end), weight 2.0.
            try:
                end_samples = min(int(5.0 * sr), len(y_harmonic))
                y_end = y_harmonic[-end_samples:]
                chroma_end = librosa.feature.chroma_cens(
                    y=y_end, sr=sr, tuning=tuning,
                )
                end_avg = np.mean(chroma_end, axis=1)
                s = end_avg.sum()
                if s > 1e-10:
                    end_avg = end_avg / s
                    for key_label, _ in _key_votes_from_chroma(end_avg):
                        all_votes.append(key_label)
                        all_weights.append(2.0)
            except Exception:
                pass

            # Chunked voting: re-run the profile vote on a handful of
            # representative excerpts of the harmonic signal.
            chunks = _select_chunks(
                y_harmonic, sr, n_chunks=_SAS_NUM_CHUNKS, use_onset=False,
            )
            for chunk in chunks:
                try:
                    ch_chroma = librosa.feature.chroma_cens(
                        y=chunk, sr=sr, tuning=tuning,
                    )
                    ch_avg = _energy_weighted_chroma(ch_chroma, chunk)
                    if ch_avg is not None:
                        for key_label, _ in _key_votes_from_chroma(ch_avg):
                            all_votes.append(key_label)
                            all_weights.append(1.0)
                except Exception:
                    pass

        # Final weighted majority vote
        if not all_votes:
            return None, "low"

        weighted_counts = {}
        for vote, w in zip(all_votes, all_weights):
            weighted_counts[vote] = weighted_counts.get(vote, 0.0) + w

        best_key = max(weighted_counts, key=weighted_counts.get)
        total_weight = sum(all_weights)
        best_weight = weighted_counts[best_key]
        # Confidence = winner's share of the total vote weight.
        share = best_weight / total_weight

        if share >= 0.55:
            confidence = "high"
        elif share >= 0.35:
            confidence = "medium"
        else:
            confidence = "low"

        logger.info(
            "Key [%s]: %s (share=%.0f%%, votes=%d, conf=%s)",
            mode, best_key, share * 100, len(all_votes), confidence,
        )
        return best_key, confidence

    except Exception as exc:
        # Key detection is best-effort metadata extraction; never let
        # it abort the caption pipeline.
        logger.warning("Key detection failed: %s", exc)
        return None, "low"
|
| 1464 |
+
|
| 1465 |
+
|
| 1466 |
+
# ---- Time-signature helpers ---------------------------------------------
|
| 1467 |
+
|
| 1468 |
+
def _timesig_core_scores(y, sr) -> dict:
    """Compute 3-signal time-signature scores on a single buffer (mid/sas).

    Combines three additive evidence signals on beat-synchronous onset
    strengths:
      1. downbeat-vs-offbeat accent contrast per candidate grouping,
      2. onset-envelope autocorrelation at each meter's bar period,
      3. within-bar beat-strength variance (3/4 and 4/4 only).

    Args:
        y: Mono audio buffer.
        sr: Sample rate of ``y``.

    Returns dict mapping signature labels ("3/4", "4/4", "6/8") to raw
    scores; empty dict when too few beats (< 8) were tracked to score.
    """
    import librosa
    import numpy as np

    scores = {}

    # Beat tracking; tempo itself is unused here, only beat positions.
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    if beat_frames is None or len(beat_frames) < 8:
        return scores

    # Onset strength sampled at each tracked beat; frames past the
    # envelope's end are dropped before indexing.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    beat_strengths = onset_env[beat_frames[beat_frames < len(onset_env)]]
    if len(beat_strengths) < 8:
        return scores

    # Signal 1: Accent pattern analysis -- fold beats into bars of the
    # candidate grouping and compare the first beat's mean strength to
    # the rest.  A strong downbeat yields contrast > 1.
    for label, grouping in [("3/4", 3), ("4/4", 4), ("6/8", 6)]:
        if len(beat_strengths) < grouping * 2:
            # Not enough bars to judge; neutral zero score.
            scores[label] = 0.0
            continue
        usable = len(beat_strengths) - (len(beat_strengths) % grouping)
        grouped = beat_strengths[:usable].reshape(-1, grouping)
        downbeat_mean = float(np.mean(grouped[:, 0]))
        offbeat_mean = float(np.mean(grouped[:, 1:]))
        contrast = downbeat_mean / offbeat_mean if offbeat_mean > 0 else 1.0
        scores[label] = contrast

    # Signal 2: Autocorrelation at meter periods -- the onset envelope
    # should self-correlate at one bar (beat period x beats-per-bar).
    # hop matches onset_strength's default hop length.
    hop = 512
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    intervals = np.diff(beat_times)
    if len(intervals) > 0:
        # Median inter-beat interval converted to envelope frames.
        median_interval = float(np.median(intervals))
        beat_period = int(round(median_interval * sr / hop))
        if beat_period > 0:
            ac = librosa.autocorrelate(onset_env, max_size=len(onset_env))
            for label, mult in [("3/4", 3), ("4/4", 4), ("6/8", 6)]:
                period = beat_period * mult
                if period < len(ac):
                    # Average a +-2-frame window around the target lag
                    # to tolerate beat-tracking jitter.
                    lo = max(0, period - 2)
                    hi = min(len(ac), period + 3)
                    ac_score = float(np.mean(ac[lo:hi]))
                    # Normalize by the zero-lag energy.
                    if ac[0] > 0:
                        ac_score /= float(ac[0])
                    scores[label] = scores.get(label, 0.0) + ac_score

    # Signal 3: Beat-strength variance ratio -- a real meter shows
    # systematic strong/weak alternation within each bar, i.e. higher
    # per-bar variance when the grouping is correct.
    for label, grouping in [("3/4", 3), ("4/4", 4)]:
        usable = len(beat_strengths) - (len(beat_strengths) % grouping)
        if usable >= grouping * 2:
            grouped = beat_strengths[:usable].reshape(-1, grouping)
            row_vars = np.var(grouped, axis=1)
            scores[label] = scores.get(label, 0.0) + float(np.mean(row_vars))

    return scores
|
| 1527 |
+
|
| 1528 |
+
|
| 1529 |
+
# ---- Unified time-signature detection -----------------------------------
|
| 1530 |
+
|
| 1531 |
+
def _detect_time_sig(y, sr, mode: str = "faf") -> Tuple[Optional[str], str]:
|
| 1532 |
+
"""Estimate time signature with quality controlled by mode.
|
| 1533 |
+
|
| 1534 |
+
faf: Hardcoded "4/4" (correct ~80%+ of the time).
|
| 1535 |
+
mid: Beat-sync accent + AC + variance + 4/4 prior.
|
| 1536 |
+
sas: mid signals + PLP periodicity + multi-band onset +
|
| 1537 |
+
tempogram harmonic ratios + chunked voting.
|
| 1538 |
+
|
| 1539 |
+
Returns (signature, confidence).
|
| 1540 |
+
"""
|
| 1541 |
+
if mode == "faf":
|
| 1542 |
+
return "4/4", "low"
|
| 1543 |
+
|
| 1544 |
+
import librosa
|
| 1545 |
+
import numpy as np
|
| 1546 |
+
|
| 1547 |
+
try:
|
| 1548 |
+
# mid: core 3-signal scoring
|
| 1549 |
+
scores = _timesig_core_scores(y, sr)
|
| 1550 |
|
| 1551 |
+
# sas: additional techniques
|
| 1552 |
+
if mode == "sas":
|
| 1553 |
+
onset_env = librosa.onset.onset_strength(y=y, sr=sr)
|
| 1554 |
+
|
| 1555 |
+
# PLP periodicity
|
| 1556 |
+
try:
|
| 1557 |
+
pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr)
|
| 1558 |
+
plp_ac = librosa.autocorrelate(pulse, max_size=len(pulse))
|
| 1559 |
+
tempo_est, _ = librosa.beat.beat_track(y=y, sr=sr)
|
| 1560 |
+
tempo_val = float(np.atleast_1d(tempo_est)[0])
|
| 1561 |
+
if tempo_val > 0:
|
| 1562 |
+
hop = 512
|
| 1563 |
+
bp = int(round(60.0 / tempo_val * sr / hop))
|
| 1564 |
+
if bp > 0:
|
| 1565 |
+
for label, mult in [("3/4", 3), ("4/4", 4), ("6/8", 6)]:
|
| 1566 |
+
lag = bp * mult
|
| 1567 |
+
if lag < len(plp_ac):
|
| 1568 |
+
lo = max(0, lag - 2)
|
| 1569 |
+
hi = min(len(plp_ac), lag + 3)
|
| 1570 |
+
s = float(np.mean(plp_ac[lo:hi]))
|
| 1571 |
+
if plp_ac[0] > 0:
|
| 1572 |
+
s /= float(plp_ac[0])
|
| 1573 |
+
scores[label] = scores.get(label, 0.0) + s
|
| 1574 |
+
except Exception:
|
| 1575 |
+
pass
|
| 1576 |
+
|
| 1577 |
+
# Multi-band onset analysis (low/mid/high)
|
| 1578 |
+
try:
|
| 1579 |
+
S = np.abs(librosa.stft(y))
|
| 1580 |
+
n_bins = S.shape[0]
|
| 1581 |
+
third = n_bins // 3
|
| 1582 |
+
bands = {
|
| 1583 |
+
"low": S[:third, :],
|
| 1584 |
+
"mid_band": S[third:2*third, :],
|
| 1585 |
+
"high": S[2*third:, :],
|
| 1586 |
+
}
|
| 1587 |
+
for band_name, band_S in bands.items():
|
| 1588 |
+
band_onset = librosa.onset.onset_strength(S=band_S, sr=sr)
|
| 1589 |
+
band_ac = librosa.autocorrelate(
|
| 1590 |
+
band_onset, max_size=len(band_onset),
|
| 1591 |
+
)
|
| 1592 |
+
tempo_val2 = float(np.atleast_1d(tempo_est)[0])
|
| 1593 |
+
if tempo_val2 > 0:
|
| 1594 |
+
hop = 512
|
| 1595 |
+
bp2 = int(round(60.0 / tempo_val2 * sr / hop))
|
| 1596 |
+
if bp2 > 0 and band_ac[0] > 0:
|
| 1597 |
+
for label, mult in [("3/4", 3), ("4/4", 4)]:
|
| 1598 |
+
lag = bp2 * mult
|
| 1599 |
+
if lag < len(band_ac):
|
| 1600 |
+
lo = max(0, lag - 2)
|
| 1601 |
+
hi = min(len(band_ac), lag + 3)
|
| 1602 |
+
s = float(np.mean(band_ac[lo:hi]))
|
| 1603 |
+
s /= float(band_ac[0])
|
| 1604 |
+
w = 1.5 if band_name == "low" else 1.0
|
| 1605 |
+
scores[label] = scores.get(label, 0.0) + s * w
|
| 1606 |
+
except Exception:
|
| 1607 |
+
pass
|
| 1608 |
+
|
| 1609 |
+
# Tempogram harmonic ratios
|
| 1610 |
+
try:
|
| 1611 |
+
tempogram = librosa.feature.tempogram(
|
| 1612 |
+
onset_envelope=onset_env, sr=sr,
|
| 1613 |
+
)
|
| 1614 |
+
avg_tg = np.mean(tempogram, axis=1)
|
| 1615 |
+
bpm_axis = librosa.tempo_frequencies(tempogram.shape[0], sr=sr)
|
| 1616 |
+
if tempo_val > 0:
|
| 1617 |
+
for mult_label, t_mult in [("duple", 2.0), ("triple", 3.0)]:
|
| 1618 |
+
target_bpm = tempo_val * t_mult
|
| 1619 |
+
if target_bpm < 300:
|
| 1620 |
+
idx = np.argmin(np.abs(bpm_axis - target_bpm))
|
| 1621 |
+
energy = float(avg_tg[idx])
|
| 1622 |
+
base_idx = np.argmin(np.abs(bpm_axis - tempo_val))
|
| 1623 |
+
base_energy = float(avg_tg[base_idx]) + 1e-10
|
| 1624 |
+
ratio = energy / base_energy
|
| 1625 |
+
if t_mult == 2.0:
|
| 1626 |
+
scores["4/4"] = scores.get("4/4", 0.0) + ratio
|
| 1627 |
+
else:
|
| 1628 |
+
scores["3/4"] = scores.get("3/4", 0.0) + ratio
|
| 1629 |
+
except Exception:
|
| 1630 |
+
pass
|
| 1631 |
+
|
| 1632 |
+
# Chunked voting
|
| 1633 |
+
chunks = _select_chunks(y, sr, n_chunks=_SAS_NUM_CHUNKS, use_onset=True)
|
| 1634 |
+
chunk_votes = []
|
| 1635 |
+
for chunk in chunks:
|
| 1636 |
+
cs = _timesig_core_scores(chunk, sr)
|
| 1637 |
+
if cs:
|
| 1638 |
+
cs["4/4"] = cs.get("4/4", 0.0) * 1.15
|
| 1639 |
+
best_c = max(cs, key=cs.get)
|
| 1640 |
+
chunk_votes.append(best_c)
|
| 1641 |
+
for vote in chunk_votes:
|
| 1642 |
+
scores[vote] = scores.get(vote, 0.0) + 1.0
|
| 1643 |
+
|
| 1644 |
+
# Bayesian prior: bias toward 4/4
|
| 1645 |
+
scores["4/4"] = scores.get("4/4", 0.0) * 1.15
|
| 1646 |
+
|
| 1647 |
+
if not scores:
|
| 1648 |
+
return "4/4", "low"
|
| 1649 |
+
|
| 1650 |
+
best = max(scores, key=scores.get)
|
| 1651 |
+
|
| 1652 |
+
# Confidence: margin between top 2
|
| 1653 |
+
sorted_scores = sorted(scores.values(), reverse=True)
|
| 1654 |
+
if len(sorted_scores) >= 2 and sorted_scores[1] > 0:
|
| 1655 |
+
margin = sorted_scores[0] / sorted_scores[1]
|
| 1656 |
+
else:
|
| 1657 |
+
margin = 1.0
|
| 1658 |
+
|
| 1659 |
+
if margin > 1.4:
|
| 1660 |
+
confidence = "high"
|
| 1661 |
+
elif margin > 1.15:
|
| 1662 |
+
confidence = "medium"
|
| 1663 |
+
else:
|
| 1664 |
+
confidence = "low"
|
| 1665 |
+
|
| 1666 |
+
logger.info(
|
| 1667 |
+
"TimeSig [%s]: %s (scores=%s, margin=%.2f, conf=%s)",
|
| 1668 |
+
mode, best,
|
| 1669 |
+
{k: round(v, 3) for k, v in scores.items()},
|
| 1670 |
+
margin, confidence,
|
| 1671 |
+
)
|
| 1672 |
+
return best, confidence
|
| 1673 |
+
|
| 1674 |
+
except Exception as exc:
|
| 1675 |
+
logger.warning("Time signature detection failed: %s", exc)
|
| 1676 |
+
return "4/4", "low"
|
| 1677 |
|
| 1678 |
|
| 1679 |
def _sanitize_tag(value: str) -> str:
|
|
|
|
| 1743 |
return title or audio_path.stem, artist or ""
|
| 1744 |
|
| 1745 |
|
| 1746 |
+
def analyze_and_caption(
|
| 1747 |
+
audio_path: str,
|
| 1748 |
+
mode: str = "faf",
|
| 1749 |
+
device: str = "cpu",
|
| 1750 |
+
) -> Dict[str, Any]:
|
| 1751 |
"""Analyze an audio file and build a training caption.
|
| 1752 |
|
| 1753 |
+
Supports three quality modes:
|
| 1754 |
+
faf - CPU, ~2-3s/file. Single-method detection on raw mix.
|
| 1755 |
+
mid - ~12s/file. Demucs stems + 3-method ensemble.
|
| 1756 |
+
sas - ~30s/file. Deep multi-technique + chunked analysis.
|
| 1757 |
+
|
| 1758 |
+
For mid/sas, Demucs separates drums and harmonics stems first.
|
| 1759 |
+
On CPU, Demucs adds ~2-5 minutes per file.
|
| 1760 |
|
| 1761 |
Args:
|
| 1762 |
audio_path: Path to the audio file.
|
| 1763 |
+
mode: Analysis mode ("faf", "mid", or "sas").
|
| 1764 |
+
device: Torch device for Demucs ("cpu").
|
| 1765 |
|
| 1766 |
Returns:
|
| 1767 |
+
Dict with keys: caption, bpm, key, signature, lyrics, title, artist,
|
| 1768 |
+
confidence (dict of per-field confidence levels).
|
| 1769 |
"""
|
| 1770 |
import librosa
|
| 1771 |
import numpy as np
|
| 1772 |
|
| 1773 |
audio_path = Path(audio_path)
|
| 1774 |
|
| 1775 |
+
if mode not in _ANALYSIS_MODES:
|
| 1776 |
+
logger.warning("Unknown analysis mode '%s', falling back to 'faf'", mode)
|
| 1777 |
+
mode = "faf"
|
| 1778 |
+
|
| 1779 |
# Load audio once, reuse for all detectors
|
| 1780 |
try:
|
| 1781 |
y, sr = librosa.load(str(audio_path), sr=None, mono=True)
|
|
|
|
| 1793 |
"caption": f"A track by {artist}" if artist else f"A track titled {title}",
|
| 1794 |
"bpm": None, "key": None, "signature": "4/4",
|
| 1795 |
"lyrics": "[Instrumental]", "title": title, "artist": artist,
|
| 1796 |
+
"confidence": {},
|
| 1797 |
}
|
| 1798 |
|
| 1799 |
+
confidence = {}
|
| 1800 |
+
tmp_dir = None
|
| 1801 |
+
|
| 1802 |
+
try:
|
| 1803 |
+
if mode in ("mid", "sas"):
|
| 1804 |
+
# Demucs stem separation -- run BPM/timesig on drums,
|
| 1805 |
+
# key detection on harmonics
|
| 1806 |
+
tmp_dir = Path(tempfile.mkdtemp(prefix="ace_analysis_"))
|
| 1807 |
+
try:
|
| 1808 |
+
drums_path, harmonics_path = separate_stems(
|
| 1809 |
+
audio_path, tmp_dir, device=device,
|
| 1810 |
+
)
|
| 1811 |
+
# Load separated stems for analysis
|
| 1812 |
+
y_drums, sr_drums = librosa.load(
|
| 1813 |
+
str(drums_path), sr=None, mono=True,
|
| 1814 |
+
)
|
| 1815 |
+
y_harmonics, sr_harmonics = librosa.load(
|
| 1816 |
+
str(harmonics_path), sr=None, mono=True,
|
| 1817 |
+
)
|
| 1818 |
+
# Preprocess stems
|
| 1819 |
+
y_drums_trimmed, _ = librosa.effects.trim(y_drums, top_db=30)
|
| 1820 |
+
if len(y_drums_trimmed) >= sr_drums:
|
| 1821 |
+
y_drums = y_drums_trimmed
|
| 1822 |
+
peak_d = np.max(np.abs(y_drums))
|
| 1823 |
+
if peak_d > 0:
|
| 1824 |
+
y_drums = y_drums / peak_d
|
| 1825 |
+
|
| 1826 |
+
y_harm_trimmed, _ = librosa.effects.trim(y_harmonics, top_db=30)
|
| 1827 |
+
if len(y_harm_trimmed) >= sr_harmonics:
|
| 1828 |
+
y_harmonics = y_harm_trimmed
|
| 1829 |
+
peak_h = np.max(np.abs(y_harmonics))
|
| 1830 |
+
if peak_h > 0:
|
| 1831 |
+
y_harmonics = y_harmonics / peak_h
|
| 1832 |
+
|
| 1833 |
+
# BPM + time sig on drums stem
|
| 1834 |
+
bpm, bpm_conf = _detect_bpm(y_drums, sr_drums, mode)
|
| 1835 |
+
signature, sig_conf = _detect_time_sig(y_drums, sr_drums, mode)
|
| 1836 |
+
# Key on harmonics stem
|
| 1837 |
+
key, key_conf = _detect_key(y_harmonics, sr_harmonics, mode)
|
| 1838 |
+
|
| 1839 |
+
confidence = {"bpm": bpm_conf, "key": key_conf, "signature": sig_conf}
|
| 1840 |
+
|
| 1841 |
+
except Exception as exc:
|
| 1842 |
+
logger.warning(
|
| 1843 |
+
"Demucs separation failed for %s: %s -- "
|
| 1844 |
+
"falling back to analysis on raw mix",
|
| 1845 |
+
audio_path.name, exc,
|
| 1846 |
+
)
|
| 1847 |
+
# Fallback: run detectors on raw mix
|
| 1848 |
+
bpm, bpm_conf = _detect_bpm(y, sr, mode)
|
| 1849 |
+
key, key_conf = _detect_key(y, sr, mode)
|
| 1850 |
+
signature, sig_conf = _detect_time_sig(y, sr, mode)
|
| 1851 |
+
confidence = {"bpm": bpm_conf, "key": key_conf, "signature": sig_conf}
|
| 1852 |
+
else:
|
| 1853 |
+
# faf: all detectors on raw mix
|
| 1854 |
+
bpm, bpm_conf = _detect_bpm(y, sr, mode)
|
| 1855 |
+
key, key_conf = _detect_key(y, sr, mode)
|
| 1856 |
+
signature, sig_conf = _detect_time_sig(y, sr, mode)
|
| 1857 |
+
confidence = {"bpm": bpm_conf, "key": key_conf, "signature": sig_conf}
|
| 1858 |
+
|
| 1859 |
+
finally:
|
| 1860 |
+
if tmp_dir is not None:
|
| 1861 |
+
try:
|
| 1862 |
+
shutil.rmtree(tmp_dir)
|
| 1863 |
+
except OSError as exc:
|
| 1864 |
+
logger.debug("Could not clean temp dir %s: %s", tmp_dir, exc)
|
| 1865 |
+
|
| 1866 |
title, artist = _extract_metadata_from_tags(audio_path)
|
| 1867 |
|
| 1868 |
# Build caption string for ACE-Step training
|
|
|
|
| 1888 |
"lyrics": lyrics,
|
| 1889 |
"title": title,
|
| 1890 |
"artist": artist,
|
| 1891 |
+
"confidence": confidence,
|
| 1892 |
}
|
| 1893 |
|
| 1894 |
+
logger.info("Auto-caption [%s] for %s: %s", mode, audio_path.name, caption)
|
| 1895 |
return result
|
| 1896 |
|
| 1897 |
|
|
|
|
| 1996 |
if sidecar and sidecar.get("caption"):
|
| 1997 |
caption = sidecar["caption"]
|
| 1998 |
lyrics = sidecar.get("lyrics", "[Instrumental]")
|
| 1999 |
+
logger.info("[Caption] %s: using existing sidecar", af.name)
|
| 2000 |
else:
|
| 2001 |
+
# Auto-select analysis mode based on dataset size
|
| 2002 |
+
if total <= 20:
|
| 2003 |
+
analysis_mode = "sas"
|
| 2004 |
+
elif total <= 100:
|
| 2005 |
+
analysis_mode = "mid"
|
| 2006 |
+
else:
|
| 2007 |
+
analysis_mode = "faf"
|
| 2008 |
+
|
| 2009 |
+
# Log mode selection with reasoning (first file only)
|
| 2010 |
+
if i == 0:
|
| 2011 |
+
_MODE_DESC = {
|
| 2012 |
+
"faf": "fast, ~3s/file",
|
| 2013 |
+
"mid": "balanced, ~12s/file",
|
| 2014 |
+
"sas": "best quality, ~30s/file on GPU, slower on CPU",
|
| 2015 |
+
}
|
| 2016 |
+
logger.info(
|
| 2017 |
+
"[Analysis] Mode auto-selected: '%s' (%s) "
|
| 2018 |
+
"for %d files (<=20: sas, 21-100: mid, 100+: faf)",
|
| 2019 |
+
analysis_mode, _MODE_DESC[analysis_mode], total,
|
| 2020 |
+
)
|
| 2021 |
+
if analysis_mode in ("mid", "sas") and device == "cpu":
|
| 2022 |
+
logger.warning(
|
| 2023 |
+
"[Analysis] Mode '%s' uses Demucs stem separation "
|
| 2024 |
+
"which is SLOW on CPU (~2-5 min/file). "
|
| 2025 |
+
"Total estimated time: ~%d-%d min for %d files. "
|
| 2026 |
+
"Use 'faf' mode or a GPU machine for faster processing.",
|
| 2027 |
+
analysis_mode,
|
| 2028 |
+
total * 2, total * 5, total,
|
| 2029 |
+
)
|
| 2030 |
+
|
| 2031 |
try:
|
| 2032 |
+
logger.info("[Caption] %s: analyzing (mode=%s)...", af.name, analysis_mode)
|
| 2033 |
+
analysis = analyze_and_caption(
|
| 2034 |
+
str(af), mode=analysis_mode, device=device,
|
| 2035 |
+
)
|
| 2036 |
caption = analysis["caption"]
|
| 2037 |
lyrics = analysis.get("lyrics", "[Instrumental]")
|
| 2038 |
_write_caption_sidecar(af, analysis)
|
| 2039 |
+
logger.info("[Caption] %s: %s", af.name, caption)
|
| 2040 |
except Exception as exc:
|
| 2041 |
+
logger.warning("[Caption] %s: analysis failed (%s), using filename", af.name, exc)
|
| 2042 |
caption = af.stem
|
| 2043 |
lyrics = "[Instrumental]"
|
| 2044 |
text_prompt = caption
|