"""Training data generation pipeline for V3 lipsync model.

Produces one .npz per scenario holding three frame-aligned arrays:
    - audio:  (T, 80) log-mel features [simple mel for now; mel vs wav2vec
              and the 141-dim audio feature layout are still TBD]
    - cond:   (T, 19) [16 emotion one-hot + 3 VAD]
    - target: (T, 52) [LAM lipsync + compiler expression merged by channel rules]

Usage:
    python -m scripts.compiler.data_pipeline --limit 10   # test run
    python -m scripts.compiler.data_pipeline              # full run
"""
from __future__ import annotations

import argparse
import asyncio
import json
import logging
from pathlib import Path
from typing import List

import librosa
import numpy as np
import torch

from .constants import (
    LAM_WEIGHTS_SHARED,
    LIPSYNC_ONLY,
    EXPRESSION_ONLY,
    SHARED_CHANNELS,
)
from .expressive import compile_expressive_batch
from .eye_motion import apply_eye_motion
from .lam_wrapper import LAMWrapper
from .tremor import apply_tremor, silence_gate_from_wav
from .tts import synth_all
from .utils import build_synthetic_presets, load_presets_from_json

# ── Persistence rule ─────────────────────────────────────────────────────
# Every sub-emotion label collapses onto one of 5 base emotion families.
# compute_persistence_scales uses this table to tell "fleeting" turns (no
# adjacent turn of the same family) from sustained ones. Keep it identical
# to the table in scripts/compiler/abc_experiment.py so training targets
# stay in lockstep with the viewer renders.
SUB_TO_BASE = {
    sub: base
    for base, subs in (
        ('neutral', ('neutral',)),
        ('joy', ('joy', 'laughter', 'excitement', 'agreement', 'gratitude')),
        ('sadness', ('sadness', 'crying', 'sulk', 'apology', 'struggle')),
        ('anger', ('anger', 'refusal')),
        ('surprise', ('surprise', 'fluster', 'shy')),
    )
    for sub in subs
}

# Brow channels for pass-through-neutral routing during emotional inversions
# (raised → lowered, etc.). Same constants as abc_experiment.py.
# Channel indices that crossfade_turn_boundaries inspects for large swings.
BROW_CHANNELS = [0, 1, 2, 3, 4]
# A brow channel is routed through 0 when |prev - next| exceeds this delta.
BROW_SWING_DELTA = 0.40
# Fraction of the crossfade, centred on its midpoint, that
# _brow_pass_through_zero holds at 0.
NEUTRAL_PAUSE_FRACTION = 0.20

# Module logger (fixed name rather than __name__).
LOG = logging.getLogger("data_pipeline")

# Two directory levels above the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Default for --scenarios (JSONL file) and for --output.
SCENARIOS = PROJECT_ROOT / "data" / "emotion" / "seed_train_final.jsonl"
OUTPUT_DIR = PROJECT_ROOT / "data" / "v3_training"
# Pre-generated audio from scripts.compiler.generate_audio
# Naming convention: {scenario_id}_t{turn_idx}_{emotion}.mp3
AUDIO_DIR = PROJECT_ROOT / "data" / "audio_preview"
# Frames per second passed to mel extraction, the silence gate and eye motion.
FPS = 30
# Emotion vocabulary; the list position of a label is its one-hot index in
# the conditioning vector. Unknown labels fall back to index 0 ("neutral")
# wherever EMOTION_TO_IDX.get(..., 0) is used.
EMOTION_LABELS = [
    "neutral", "joy", "laughter", "excitement", "agreement", "gratitude",
    "sadness", "crying", "sulk", "apology", "struggle",
    "anger", "refusal", "surprise", "fluster", "shy",
]
EMOTION_TO_IDX = {e: i for i, e in enumerate(EMOTION_LABELS)}


async def synth_turns_batch(turns: List[dict], out_dir: Path,
                             backend: str = "elevenlabs", concurrency: int = 4,
                             **tts_kwargs) -> List[Path]:
    """Synthesize speech for every turn with the selected TTS backend.

    Turn ``i`` is rendered to ``out_dir/{i:06d}.mp3`` (the directory is
    created if needed). Each turn's optional ``emotion`` and ``vad`` entries
    are forwarded to ``synth_all`` for prosody; any extra keyword arguments
    are passed through to ``synth_all`` unchanged.

    Returns a list aligned with ``turns``: the output path where ``synth_all``
    reported success, ``None`` where it did not.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    targets = [out_dir / f"{idx:06d}.mp3" for idx, _ in enumerate(turns)]
    succeeded = await synth_all(
        [turn["text"] for turn in turns],
        targets,
        backend=backend,
        concurrency=concurrency,
        emotions=[turn.get("emotion") for turn in turns],
        vads=[turn.get("vad") for turn in turns],
        **tts_kwargs,
    )
    return [
        target if ok else None
        for target, ok in zip(targets, succeeded)
    ]


def lookup_audio_for_scenario(scen: dict, audio_dir: Path) -> List[Path]:
    """Find pre-generated audio files for a scenario's turns.

    Matches filename pattern: {scenario_id}_t{turn_idx}_{emotion}.mp3

    For per-turn dialogue splits, audio is named after the *original* scenario
    and turn index, not the new pseudo-scenario id. expand_split_dialogues
    sets `_source_scenario_id` and `_source_turn_indices` so we can reroute
    lookup to the original (sid, ti) here.

    SAFE: never calls TTS, never writes to audio_dir — read-only lookup.

    Args:
        scen: scenario dict with ``scenario_id`` and ``turns``; may carry
            ``_source_scenario_id`` / ``_source_turn_indices`` overrides.
        audio_dir: directory that holds the .mp3 files.

    Returns:
        A list aligned to ``scen['turns']``. An entry is ``None`` when the
        turn's text is empty/missing or no usable audio file was found.
        A file is usable only if it is larger than 1000 bytes. When the exact
        name is absent and several files match the fallback glob, the
        lexicographically smallest path is chosen so reruns are reproducible.
    """
    sid = scen.get("_source_scenario_id", scen["scenario_id"])
    src_tis = scen.get("_source_turn_indices")

    def _is_usable(candidate: Path) -> bool:
        # A file that vanished (or a dangling symlink) counts as "missing"
        # instead of aborting the whole lookup.
        try:
            return candidate.stat().st_size > 1000
        except OSError:
            return False

    paths: List[Path] = []
    for local_ti, turn in enumerate(scen["turns"]):
        # `or ""` also covers an explicit `"text": None`.
        if not (turn.get("text") or "").strip():
            paths.append(None)
            continue
        actual_ti = src_tis[local_ti] if src_tis is not None else local_ti
        emo = turn.get("emotion", "neutral")
        # Exact match first (scenario_id_tN_emotion.mp3)
        expected = audio_dir / f"{sid}_t{actual_ti}_{emo}.mp3"
        if _is_usable(expected):
            paths.append(expected)
            continue
        # Fall back to glob (the emotion label may have changed). glob order
        # is filesystem-dependent, so sort before picking the first hit.
        matches = sorted(
            m for m in audio_dir.glob(f"{sid}_t{actual_ti}_*.mp3")
            if _is_usable(m)
        )
        paths.append(matches[0] if matches else None)
    return paths


def _load_v2_helpers(variant: str):
    """Create the V2 ONNX session and feature extractor on demand and bundle
    them with the V2-dynamics helpers from abc_experiment.py.

    All imports are function-local on purpose: abc_experiment.py imports
    `merge_lam_compiler` / `speech_gate` from THIS module, so importing it
    at module level would create an import cycle.

    Returns a dict with keys: variant, sess, feat, run_v2, apply_v2_dynamics,
    get_preset_envelope. The session is created with the CPU execution
    provider only.
    """
    import sys as _sys
    import onnxruntime as ort
    # The `distillation` package lives in this external checkout.
    _sys.path.insert(0, '/dataset/text-to-face-se/LAM_Audio2Expression')
    from distillation.student_model import AudioFeatureExtractor
    from scripts.compiler.abc_experiment import (
        ONNX_V2, run_v2, apply_v2_dynamics, get_preset_envelope,
    )
    LOG.info(f"Loading V2 ONNX: {ONNX_V2}")
    session = ort.InferenceSession(ONNX_V2, providers=['CPUExecutionProvider'])
    extractor = AudioFeatureExtractor()
    return dict(
        variant=variant,
        sess=session,
        feat=extractor,
        run_v2=run_v2,
        apply_v2_dynamics=apply_v2_dynamics,
        get_preset_envelope=get_preset_envelope,
    )


def expand_split_dialogues(scenarios: List[dict]) -> List[dict]:
    """Turn every daily_* dialogue into one single-turn pseudo-scenario per
    non-empty turn; pass every other scenario through untouched.

    Each pseudo-scenario looks like:
        scenario_id:          "{original_sid}_t{turn_idx}"
        _source_scenario_id:  original sid
        _source_turn_indices: [original turn_idx]
        turns:                [original turn]   (same dict object, not a copy)

    The `_source_*` fields exist because audio files are named after the
    original scenario id and turn index; lookup_audio_for_scenario reads
    them to find the right file. The new scenario_id is what process_scenario
    later hands to the eye-motion and tremor seeding.

    Rationale: the blendshape model only consumes (audio, VAD) → face at the
    rendering stage, and contextual emotion learning already lives in
    MicroAlbert (text + previous-turn context). So dialogue structure is
    dropped for this dataset and each turn becomes a self-contained short
    monologue. Scenarios whose id does not start with "daily_" (long_/solo_)
    keep their multi-turn structure, which is what teaches inter-emotion
    transitions.
    """
    expanded: List[dict] = []
    for scen in scenarios:
        original_id = scen.get("scenario_id", "")
        if original_id.startswith("daily_"):
            expanded.extend(
                {
                    "scenario_id": f"{original_id}_t{turn_no}",
                    "_source_scenario_id": original_id,
                    "_source_turn_indices": [turn_no],
                    "turns": [single_turn],
                }
                for turn_no, single_turn in enumerate(scen["turns"])
                if single_turn.get("text", "").strip()
            )
        else:
            expanded.append(scen)
    return expanded


def mel_features(wav: np.ndarray, sr: int = 16000, fps: int = 30, n_mels: int = 80) -> np.ndarray:
    """Compute a log-mel spectrogram with roughly one frame per video frame.

    The hop is ``sr / fps`` truncated to an integer number of samples, so the
    frame count only approximately follows ``fps``; callers trim or pad to
    the frame count they need. The FFT size is fixed at 1024.

    Returns (T, n_mels) float32, in dB as produced by librosa.power_to_db
    with its default reference.
    """
    samples_per_frame = int(sr / fps)
    power_spec = librosa.feature.melspectrogram(
        y=wav,
        sr=sr,
        n_fft=1024,
        hop_length=samples_per_frame,
        n_mels=n_mels,
    )
    # librosa returns (n_mels, T); transpose so time is the leading axis.
    frames_first = librosa.power_to_db(power_spec).T
    return frames_first.astype(np.float32)


def build_conditioning(emotion: str, vad: List[float], T: int) -> np.ndarray:
    """Build the (T, 19) float32 conditioning matrix for one turn.

    Columns 0-15 hold the emotion one-hot (labels missing from
    EMOTION_TO_IDX fall back to index 0), columns 16-18 hold the VAD
    triple. The same row is repeated for all T frames.
    """
    hot_column = EMOTION_TO_IDX.get(emotion, 0)
    vad_row = np.asarray(vad, dtype=np.float32)
    matrix = np.zeros((T, 19), dtype=np.float32)
    matrix[:, 16:] = vad_row
    matrix[:, hot_column] = 1.0
    return matrix


def speech_gate(lam_bs: np.ndarray) -> np.ndarray:
    """Per-frame speech activity in [0, 1] derived from LAM mouth channels.

    Per V3_IMPLEMENTATION_PLAN_v2 §3.4:
        activity = 1.2*jawOpen + 1.5*mouthClose + 1.0*mouthFunnel + 1.0*mouthPucker
    The weighted sum is divided by 1.5 and hard-clipped to [0, 1] (a linear
    ramp with saturation, not a true sigmoid).

    ``lam_bs`` is indexed as (frames, channels) in the 52-channel order;
    the result is a float32 vector with one value per frame.
    """
    jaw_open = 1.2 * lam_bs[:, 24]
    mouth_close = 1.5 * lam_bs[:, 26]
    mouth_funnel = 1.0 * lam_bs[:, 31]
    mouth_pucker = 1.0 * lam_bs[:, 37]
    weighted = jaw_open + mouth_close + mouth_funnel + mouth_pucker
    scaled = weighted / 1.5
    return np.clip(scaled, 0.0, 1.0).astype(np.float32)


def _brow_pass_through_zero(prev_v: float, next_v: float, t: float) -> float:
    """Value of one brow channel at crossfade position ``t`` (0..1) when the
    transition is routed through the neutral (0) pose.

    crossfade_turn_boundaries uses this when |delta| > BROW_SWING_DELTA so a
    sad↔anger inversion does not slide linearly between the two extremes.

    Same profile as abc_experiment.py, with PAUSE = NEUTRAL_PAUSE_FRACTION:
        [0, 0.5−PAUSE/2):            prev → 0   (cosine ramp-down)
        [0.5−PAUSE/2, 0.5+PAUSE/2):  hold at 0
        [0.5+PAUSE/2, 1]:            0 → next   (cosine ramp-up)
    """
    pause_half = NEUTRAL_PAUSE_FRACTION / 2
    ramp_len = 0.5 - pause_half   # duration of each ramp, in units of t
    hold_end = 0.5 + pause_half

    if t < ramp_len:
        progress = t / ramp_len if ramp_len > 0 else 1.0
        ease = 0.5 * (1.0 - np.cos(progress * np.pi))
        return (1.0 - ease) * prev_v

    if t < hold_end:
        return 0.0

    # Degenerate ramp (pause covers the whole fade): jump straight to next_v.
    progress = (t - hold_end) / ramp_len if ramp_len > 0 else 1.0
    ease = 0.5 * (1.0 - np.cos(progress * np.pi))
    return ease * next_v


def _one_euro_filter(signal: np.ndarray, fps: float = 30.0,
                     min_cutoff: float = 1.5, beta: float = 0.5,
                     d_cutoff: float = 1.0) -> np.ndarray:
    """One-Euro adaptive low-pass. Peak-preserving smoother for expression
    channels (not lipsync-critical). Same recurrence as abc_experiment.py.

    Args:
        signal: 1-D sequence of samples taken at ``fps``.
        fps: sampling rate; the filter time step is ``1 / fps``.
        min_cutoff: cutoff frequency used when the signal is not moving.
        beta: how strongly the cutoff grows with the smoothed speed.
        d_cutoff: fixed cutoff of the low-pass applied to the derivative.

    Returns:
        float32 array of the same length. The first sample is copied
        through unchanged; an empty input yields an empty array instead of
        raising IndexError.
    """
    n = len(signal)
    out = np.zeros(n, dtype=np.float32)
    if n == 0:
        return out

    te = 1.0 / fps

    def smoothing_factor(cutoff):
        # Exponential-smoothing alpha for a first-order low-pass at `cutoff`.
        r = 2.0 * np.pi * cutoff * te
        return r / (r + 1.0)

    # The derivative cutoff is constant, so its alpha is loop-invariant.
    a_d = smoothing_factor(d_cutoff)

    out[0] = signal[0]
    dx_prev = 0.0
    for i in range(1, n):
        # Speed is measured against the previous *filtered* sample.
        dx = (signal[i] - out[i - 1]) / te
        dx_hat = a_d * dx + (1.0 - a_d) * dx_prev
        dx_prev = dx_hat
        # Faster motion → higher cutoff → less lag.
        cutoff = min_cutoff + beta * abs(dx_hat)
        a = smoothing_factor(cutoff)
        out[i] = a * signal[i] + (1.0 - a) * out[i - 1]
    return out


def smooth_expression_channels(target: np.ndarray,
                               min_cutoff: float = 1.5,
                               beta: float = 0.5) -> np.ndarray:
    """Run the One-Euro filter over every expression channel of ``target``.

    Filtered channels are EXPRESSION_ONLY plus SHARED_CHANNELS except
    channel 24, which is left to the lipsync path. The input is not
    modified; the returned copy is clipped to [0, 1] and cast to float32.
    """
    smoothed = target.copy()
    shared_without_jaw = (ch for ch in SHARED_CHANNELS if ch != 24)
    channels = sorted(set(EXPRESSION_ONLY).union(shared_without_jaw))
    for ch in channels:
        column = smoothed[:, ch]
        smoothed[:, ch] = _one_euro_filter(
            column, min_cutoff=min_cutoff, beta=beta
        )
    return np.clip(smoothed, 0.0, 1.0).astype(np.float32)


def _jitter_gate_smooth(signal: np.ndarray,
                        base_alpha: float = 0.6,
                        jitter_alpha: float = 0.1,
                        jitter_threshold: float = 0.03) -> np.ndarray:
    """V2-style jitter-gated exponential moving average.

    For each frame the EMA weight depends on how far the *raw* signal moved
    since the previous frame: a step larger than ``jitter_threshold`` uses
    ``base_alpha`` (light smoothing), anything smaller uses ``jitter_alpha``
    (heavy smoothing). This removes sub-threshold mouth jitter without
    flattening real phoneme transitions.

    Mirrors animasync-face-v2/pipeline_v2/smooth_v2.py::jitter_gate_smooth.
    Returns a float32 array; inputs of length 0 or 1 are returned as a copy.
    """
    n_frames = len(signal)
    if n_frames <= 1:
        return signal.astype(np.float32, copy=True)

    smoothed = np.zeros(n_frames, dtype=np.float32)
    smoothed[0] = signal[0]
    for idx in range(1, n_frames):
        step = abs(float(signal[idx]) - float(signal[idx - 1]))
        if step > jitter_threshold:
            weight = base_alpha
        else:
            weight = jitter_alpha
        smoothed[idx] = weight * signal[idx] + (1.0 - weight) * smoothed[idx - 1]
    return smoothed


def smooth_lipsync_channels(target: np.ndarray,
                            base_alpha: float = 0.6,
                            jitter_alpha: float = 0.1,
                            jitter_threshold: float = 0.03) -> np.ndarray:
    """Jitter-gate smooth the LIPSYNC_ONLY channels plus jawOpen (ch 24).

    The compiler+LAM teacher target carries sub-threshold high-frequency
    noise in the mouth/jaw channels that V3 then learns and amplifies.
    Smoothing the ground truth before training removes that noise floor
    while keeping real phoneme onsets, which exceed the jitter threshold.

    The input is not modified; the returned copy is clipped to [0, 1] and
    cast to float32.
    """
    smoothed = target.copy()
    for ch in sorted({24, *LIPSYNC_ONLY}):
        column = smoothed[:, ch]
        smoothed[:, ch] = _jitter_gate_smooth(
            column,
            base_alpha=base_alpha,
            jitter_alpha=jitter_alpha,
            jitter_threshold=jitter_threshold,
        )
    return np.clip(smoothed, 0.0, 1.0).astype(np.float32)


def crossfade_turn_boundaries(comp_stack, turn_lengths,
                              fade_frames: int = 96) -> np.ndarray:
    """Concatenate per-turn poses and overwrite a window around every turn
    boundary with a cosine-eased blend. Mirrors abc_experiment.py.

    For each boundary the window spans ``fade_frames // 2`` frames on either
    side (clamped to the sequence). Every frame in the window is replaced by
    a blend between the LAST frame of the outgoing turn and the FIRST frame
    of the incoming turn. Brow channels whose endpoint difference exceeds
    BROW_SWING_DELTA are instead routed through 0 via
    _brow_pass_through_zero.

    Returns the concatenated float32 array of shape (sum of turn frames, C).
    """
    blended = np.concatenate(comp_stack, axis=0).astype(np.float32)
    total_frames = blended.shape[0]
    half_window = fade_frames // 2

    boundary = 0
    for turn_no in range(len(turn_lengths) - 1):
        boundary += turn_lengths[turn_no]
        outgoing = comp_stack[turn_no][-1]
        incoming = comp_stack[turn_no + 1][0]

        window_lo = max(0, boundary - half_window)
        window_hi = min(total_frames, boundary + half_window)
        span = window_hi - window_lo
        if span <= 1:
            # Nothing to interpolate over.
            continue

        swinging_brows = [
            ch for ch in BROW_CHANNELS
            if abs(float(outgoing[ch]) - float(incoming[ch])) > BROW_SWING_DELTA
        ]

        for offset in range(span):
            frame = window_lo + offset
            t = offset / (span - 1)
            alpha = 0.5 * (1.0 - np.cos(t * np.pi))
            blended[frame] = (1.0 - alpha) * outgoing + alpha * incoming
            for ch in swinging_brows:
                blended[frame, ch] = _brow_pass_through_zero(
                    outgoing[ch], incoming[ch], t
                )
    return blended


def cross_emotion_compile(vads: np.ndarray, presets: dict,
                          sigma: float = 0.4) -> np.ndarray:
    """RBF over ALL preset anchors based on VAD distance — cross-emotion blend.
    Matches abc_experiment.py.cross_emotion_compile.

    Every preset contributes its blendshape vector with weight
    ``exp(-||vad_frame - vad_preset||^2 / (2 * sigma^2))``; weights are
    normalised per frame unless their sum is <= 1e-9 (frame far from every
    anchor), in which case the raw, near-zero weights are used and the frame
    comes out essentially zero.

    Args:
        vads: (T, D) per-frame VAD values, D matching the preset 'vad' length.
        presets: mapping whose values are dicts with 'vad' and 'bs' entries.
        sigma: RBF width in VAD units.

    Returns:
        (T, C) float32 array clipped to [0, 1], where C is the length of the
        presets' 'bs' vectors (52 for the standard presets).
    """
    anchors = list(presets.values())
    anchor_vads = np.asarray([p['vad'] for p in anchors], dtype=np.float32)
    anchor_bs = np.asarray([p['bs'] for p in anchors], dtype=np.float32)
    n_channels = anchor_bs.shape[1] if anchor_bs.ndim == 2 else 52

    vads = np.asarray(vads)
    if vads.shape[0] == 0:
        return np.zeros((0, n_channels), dtype=np.float32)

    inv_2sig2 = 1.0 / (2.0 * sigma ** 2)
    # (T, 1, D) - (1, N, D) → squared distance of every frame to every anchor.
    diff = vads[:, None, :] - anchor_vads[None, :, :]
    d2 = np.sum(diff ** 2, axis=2)
    w = np.exp(-d2 * inv_2sig2)
    s = w.sum(axis=1, keepdims=True)
    # Normalise only rows with a meaningful total weight; others stay as-is.
    np.divide(w, s, out=w, where=s > 1e-9)
    out = w @ anchor_bs
    return np.clip(out, 0.0, 1.0).astype(np.float32)


def compute_persistence_scales(turn_emotions: List[str],
                               fleeting_scale: float = 0.65) -> List[float]:
    """Return one magnitude scale per turn based on how long its emotion
    family persists across adjacent turns.

    Emotions are mapped to their base family via SUB_TO_BASE (unknown labels
    count as "neutral"); the persistence of a turn is the length of the
    contiguous run of same-family turns it belongs to. Same rule as
    abc_experiment.py:
       persistence == 1 → fleeting_scale
       persistence == 2 → midpoint(fleeting_scale, 1.0)
       persistence >= 3 → 1.0

    Everything is left at 1.0 when there is at most one turn, when
    ``fleeting_scale >= 1.0``, or when all turns share one family
    (a sustained monologue plays at full strength).
    """
    n_turns = len(turn_emotions)
    full_strength = [1.0] * n_turns
    if n_turns <= 1 or fleeting_scale >= 1.0:
        return full_strength

    families = [SUB_TO_BASE.get(label, "neutral") for label in turn_emotions]
    if len(set(families)) == 1:
        return full_strength

    midpoint = 0.5 * (fleeting_scale + 1.0)
    scales: List[float] = []
    run_start = 0
    # Walk one past the end so the final run is flushed too.
    for pos in range(1, n_turns + 1):
        if pos < n_turns and families[pos] == families[run_start]:
            continue
        run_length = pos - run_start
        if run_length == 1:
            scale = fleeting_scale
        elif run_length == 2:
            scale = midpoint
        else:
            scale = 1.0
        scales.extend([scale] * run_length)
        run_start = pos
    return scales


def merge_lam_compiler(lam: np.ndarray, comp: np.ndarray, gate: np.ndarray) -> np.ndarray:
    """Combine LAM lipsync and compiler expression into one target, channel
    by channel, per the V3 channel rules.

    * LIPSYNC_ONLY channels are copied from ``lam``.
    * EXPRESSION_ONLY channels are copied from ``comp``.
    * SHARED_CHANNELS are blended using the per-frame speech ``gate``:
      while speaking, LAM and compiler are mixed with the channel's
      LAM_WEIGHTS_SHARED weight; while silent the compiler value wins.
      jawOpen (24) is special: the compiler value acts as a gain on the LAM
      jaw plus a small additive term, both fading out as the gate rises.
    * Correctives (§3.4): the mean of channels 43/44 (smile) attenuates
      channel 31 (funnel) by up to 80 % and channels 29/30 by up to 90 %.

    Channels outside the three groups stay 0. The result is clipped to
    [0, 1] and returned as float32.
    """
    merged = np.zeros_like(lam)

    for ch in LIPSYNC_ONLY:
        merged[:, ch] = lam[:, ch]

    for ch in EXPRESSION_ONLY:
        merged[:, ch] = comp[:, ch]

    for ch in SHARED_CHANNELS:
        lam_weight = LAM_WEIGHTS_SHARED[ch]
        if ch != 24:
            speaking_mix = lam_weight * lam[:, ch] + (1 - lam_weight) * comp[:, ch]
            merged[:, ch] = gate * speaking_mix + (1 - gate) * comp[:, ch]
            continue
        # jawOpen — arousal as gain, not additive
        jaw_gain = 1.0 + comp[:, ch] * 0.5 * (1.0 - gate)
        merged[:, ch] = lam[:, ch] * jaw_gain + comp[:, ch] * (1 - gate) * 0.3

    # Smile is measured once, before any channel is attenuated.
    smile = (merged[:, 43] + merged[:, 44]) * 0.5
    for ch, strength in ((31, 0.8), (29, 0.9), (30, 0.9)):
        merged[:, ch] *= (1 - smile * strength)

    return np.clip(merged, 0.0, 1.0).astype(np.float32)


def process_scenario(
    scenario: dict,
    audio_paths: List[Path],
    lam: LAMWrapper,
    presets: dict,
    out_path: Path,
    persistence_damping: float = 0.65,
    cross_emotion_weight: float = 0.2,
    cross_emotion_sigma: float = 0.4,
    vad_damp_gamma: float = 0.3,
    vad_damp_beta: float = 0.7,
    vad_smooth_sigma: float = 30.0,
    fade_frames: int = 96,
    blink_interval_s: float = 3.5,
    option_e_intensity: float = 1.0,
    tremor_amp: float = 0.014,
    tremor_sigma: float = 1.5,
    v2_helpers: dict = None,
    lipsync_smooth: bool = False,
    lipsync_smooth_alpha: float = 0.6,
    lipsync_smooth_jitter_alpha: float = 0.1,
    lipsync_smooth_threshold: float = 0.03,
) -> bool:
    """Build one training sample (.npz) for a scenario.

    Two-pass scenario → .npz, mirroring abc_experiment.py. With
    ``v2_helpers=None`` (the default) this is the variant-C pipeline
    (compiler + LAM, no V2 ONNX); when ``v2_helpers`` is supplied, V2 ONNX
    inference runs per turn and its dynamics are overlaid on the compiled
    expression. Defaults match the canonical lock-in (damp 0.65, xemo 0.2,
    blink 3.5, fade 96, σ=30, γ=0.3) — running with no flag overrides
    produces training targets that match what the viewer shows for _d65x20
    scenarios.

    Steps:
      1. Pass 1: per usable turn collect audio, LAM blendshapes, mel
         features (trimmed/padded to the LAM frame count), speech gate,
         silence gate, raw VAD and (optionally) the V2 output. A turn is
         skipped when its text is blank, its audio path is None or missing,
         or its audio is shorter than 0.1 s at 16 kHz.
      2. Persistence rule: pose-level scale per turn (multi-turn only).
      3. Causal VAD damping: pull each turn's VAD toward a running mean of
         the past turns (multi-turn only).
      4. Cross-turn smoothing: σ-frame Gaussian over the per-frame VAD and
         emotion one-hot trajectories (multi-turn only).
      5. Pass 2: per-turn compile (within-emotion + cross-emotion blend,
         then persistence damp, then optional V2-dynamics overlay).
      6. Crossfade between turn boundaries (full ``fade_frames`` only with
         3 or more usable turns, otherwise at most 8 frames).
      7. Merge LAM, smooth expression channels, optionally smooth lipsync
         channels, apply eye motion and tremor.
      8. Save arrays ``audio``, ``cond`` and ``target`` to ``out_path``.

    Args:
        scenario: dict with "scenario_id" and "turns"; every processed turn
            must carry "text", "emotion" and "vad" (read with ``[]``).
        audio_paths: audio file per turn, indexed by turn position; ``None``
            entries are skipped.
        lam: wrapper whose ``infer_audio(path)`` yields per-frame blendshapes.
        presets: expression presets forwarded to the compiler helpers.
        out_path: destination passed to ``np.savez_compressed``.
        persistence_damping: ``fleeting_scale`` for compute_persistence_scales.
        cross_emotion_weight: blend weight of the cross-emotion RBF result
            (0 disables it).
        cross_emotion_sigma: RBF width for cross_emotion_compile.
        vad_damp_gamma: weight of a turn's raw VAD in the damped value
            (0 disables damping).
        vad_damp_beta: retention factor of the running VAD mean.
        vad_smooth_sigma: Gaussian sigma in frames (0 disables smoothing).
        fade_frames: crossfade window length in frames.
        blink_interval_s: forwarded to apply_eye_motion.
        option_e_intensity: forwarded as ``parametric_overlay_intensity``.
        tremor_amp, tremor_sigma: forwarded to apply_tremor; tremor is
            skipped unless both are > 0.
        v2_helpers: dict as returned by _load_v2_helpers, or None.
        lipsync_smooth: enable jitter-gate smoothing of lipsync channels.
        lipsync_smooth_alpha, lipsync_smooth_jitter_alpha,
        lipsync_smooth_threshold: parameters for smooth_lipsync_channels.

    Returns:
        False when no turn was usable (nothing is written); True after the
        .npz has been saved.
    """
    # ── Pass 1 ───────────────────────────────────────────────────
    collected = []
    for turn_idx, turn in enumerate(scenario["turns"]):
        if not turn["text"].strip():
            continue
        audio_path = audio_paths[turn_idx]
        if audio_path is None or not audio_path.exists():
            continue
        wav, sr = librosa.load(str(audio_path), sr=16000, mono=True)
        # Skip clips shorter than 0.1 s.
        if len(wav) < 16000 * 0.1:
            continue
        # LAM reads the file itself; its frame count T is the reference
        # length every other per-frame array is aligned to.
        lam_bs = lam.infer_audio(audio_path)
        T = lam_bs.shape[0]
        mel = mel_features(wav, sr=sr, fps=FPS)
        # Trim extra mel frames, or pad by repeating the last mel frame.
        if mel.shape[0] > T:
            mel = mel[:T]
        elif mel.shape[0] < T:
            pad = T - mel.shape[0]
            mel = np.concatenate([mel, np.tile(mel[-1:], (pad, 1))], axis=0)
        gate = speech_gate(lam_bs)
        # Audio-derived silence gate (for tremor) — soft per-frame indicator
        # of "not speaking", aligned to lam_bs T. σ=6 smoothing matches the
        # player's runtime tremor gate exactly.
        sgate = silence_gate_from_wav(wav, sr, T, fps=FPS)
        # V2 ONNX inference (teacher) — only for variant A/B. Produces (T, 52)
        # in AnimaSync ordering; trimmed/padded to LAM T.
        v2_bs = None
        if v2_helpers is not None:
            v2_bs = v2_helpers["run_v2"](
                v2_helpers["sess"], v2_helpers["feat"], wav, turn["emotion"]
            )
            if v2_bs.shape[0] > T:
                v2_bs = v2_bs[:T]
            elif v2_bs.shape[0] < T:
                pad = T - v2_bs.shape[0]
                v2_bs = np.concatenate(
                    [v2_bs, np.tile(v2_bs[-1:], (pad, 1))], axis=0
                )
        collected.append({
            "turn_idx": turn_idx,
            "emotion": turn["emotion"],
            "vad": list(turn["vad"]),
            "T": T,
            "lam_bs": lam_bs,
            "mel": mel,
            "gate": gate,
            "silence_gate": sgate,
            "v2_bs": v2_bs,
        })
    # No usable turn → nothing to write.
    if not collected:
        return False

    # ── Persistence rule ─────────────────────────────────────────
    persist_scales = compute_persistence_scales(
        [c["emotion"] for c in collected],
        fleeting_scale=persistence_damping,
    )

    # ── Causal VAD damping (multi-turn only) ─────────────────────
    # The first turn keeps its raw VAD. Each later turn is damped against
    # the running mean of the turns BEFORE it; only then is the mean updated
    # with that turn's raw (undamped) VAD.
    if vad_damp_gamma > 0 and len(collected) > 1:
        γ = float(vad_damp_gamma)
        β = float(vad_damp_beta)
        running_mean = np.array(collected[0]["vad"], dtype=np.float32)
        for c in collected[1:]:
            raw = np.array(c["vad"], dtype=np.float32)
            damped = γ * raw + (1.0 - γ) * running_mean
            running_mean = β * running_mean + (1.0 - β) * raw
            c["vad"] = damped.tolist()

    # ── Per-frame VAD trajectory + cross-turn smoothing ──────────
    # Each turn's VAD is repeated for all of its frames, then turns are
    # stacked along the time axis.
    all_vads = np.concatenate([
        np.tile(np.asarray(c["vad"], dtype=np.float32), (c["T"], 1))
        for c in collected
    ], axis=0)
    # Per-frame emotion one-hot trajectory (16-dim), also smoothed.
    # Without this, the saved `cond` field had step-changing emotion + VAD at
    # turn boundaries while the target was smoothed — V3 couldn't learn the
    # crossfade because its input was discontinuous where its output was smooth.
    n_total = sum(c["T"] for c in collected)
    all_emos = np.zeros((n_total, len(EMOTION_LABELS)), dtype=np.float32)
    _c_emo = 0
    for c in collected:
        # Unknown emotion labels fall back to index 0.
        idx = EMOTION_TO_IDX.get(c["emotion"], 0)
        all_emos[_c_emo:_c_emo + c["T"], idx] = 1.0
        _c_emo += c["T"]
    if vad_smooth_sigma > 0 and len(collected) > 1:
        from scipy.ndimage import gaussian_filter1d
        all_vads = gaussian_filter1d(
            all_vads, sigma=vad_smooth_sigma, axis=0, mode="nearest"
        ).astype(np.float32)
        all_emos = gaussian_filter1d(
            all_emos, sigma=vad_smooth_sigma, axis=0, mode="nearest"
        ).astype(np.float32)

    # ── Pass 2: per-turn compile + cross-emotion blend + persistence damp ──
    comp_stack = []
    audio_feats, conds = [], []
    cursor = 0
    turn_lengths = []
    for c, ps in zip(collected, persist_scales):
        T = c["T"]
        emo = c["emotion"]
        # This turn's window of the (possibly smoothed) global trajectories.
        vad_slice = all_vads[cursor:cursor + T]
        emo_slice = all_emos[cursor:cursor + T]
        cursor += T

        comp_bs = compile_expressive_batch(
            emotions=[emo] * T,
            vads=vad_slice,
            presets=presets,
            parametric_overlay_intensity=option_e_intensity,
        )
        if cross_emotion_weight > 0.0:
            xemo = cross_emotion_compile(vad_slice, presets,
                                          sigma=cross_emotion_sigma)
            w = float(cross_emotion_weight)
            comp_bs = ((1.0 - w) * comp_bs + w * xemo).astype(np.float32)
        # Persistence damp: scale the whole pose for fleeting emotions.
        if ps < 1.0:
            comp_bs = (comp_bs * ps).astype(np.float32)

        # V2-dynamics overlay (variant A/B). Bandpass-filtered V2 motion is
        # added to comp_bs on the variant's channel mask, then capped to the
        # emotion's preset envelope so peaks never exceed user-authored
        # intensity range.
        if v2_helpers is not None and c.get("v2_bs") is not None:
            env_lo, env_hi = v2_helpers["get_preset_envelope"](emo, presets)
            comp_bs = v2_helpers["apply_v2_dynamics"](
                comp_bs, c["v2_bs"], v2_helpers["variant"],
                envelope_lo=env_lo, envelope_hi=env_hi,
            )

        comp_stack.append(comp_bs)
        turn_lengths.append(T)
        audio_feats.append(c["mel"])
        # Use per-frame smoothed VAD + smoothed emotion one-hot — gives the
        # student the same smooth conditioning signal the compiler uses to
        # generate the smooth target, so the model can actually learn the
        # crossfade across turn boundaries.
        cond_per_frame = np.concatenate(
            [emo_slice, vad_slice.astype(np.float32)], axis=-1
        ).astype(np.float32)
        conds.append(cond_per_frame)

    # ── Crossfade boundaries (multi-turn only) ───────────────────
    # "Monologue" here means 3 or more usable turns; shorter scenarios get
    # a crossfade of at most 8 frames.
    is_monologue = len(turn_lengths) >= 3
    fade_for_this = fade_frames if is_monologue else min(fade_frames, 8)
    if len(comp_stack) > 1:
        comp_cat = crossfade_turn_boundaries(comp_stack, turn_lengths,
                                              fade_frames=fade_for_this)
    else:
        comp_cat = comp_stack[0]

    # ── Merge LAM ─────────────────────────────────────────────────
    lam_cat = np.concatenate([c["lam_bs"] for c in collected], axis=0)
    gate_cat = np.concatenate([c["gate"] for c in collected], axis=0)
    target = merge_lam_compiler(lam_cat, comp_cat, gate_cat)

    # ── One-Euro smooth expression channels ──────────────────────
    # Monologues use a lower minimum cutoff (0.8 instead of 1.5).
    min_cutoff = 0.8 if is_monologue else 1.5
    target = smooth_expression_channels(target, min_cutoff=min_cutoff, beta=0.5)

    # ── Jitter-gate smooth lipsync channels (V2-style, opt-in) ──
    # Removes the sub-threshold mouth noise the student picks up and
    # amplifies. Leaves real phoneme transitions (Δ > threshold) intact.
    if lipsync_smooth:
        target = smooth_lipsync_channels(
            target,
            base_alpha=lipsync_smooth_alpha,
            jitter_alpha=lipsync_smooth_jitter_alpha,
            jitter_threshold=lipsync_smooth_threshold,
        )

    # ── Eye motion (deterministic per scenario, custom blink interval) ──
    target = apply_eye_motion(
        target,
        seed_str=scenario["scenario_id"],
        fps=FPS,
        blink_interval_s=blink_interval_s,
    )

    # ── Bake brow + eyeSquint tremor (matches viewer runtime tremor) ─────
    # Deterministic per scenario_id so regeneration is reproducible.
    if tremor_amp > 0.0 and tremor_sigma > 0.0:
        sgate_cat = np.concatenate(
            [c["silence_gate"] for c in collected], axis=0
        ).astype(np.float32)
        target = apply_tremor(
            target,
            silence_gate=sgate_cat,
            scenario_id=scenario["scenario_id"],
            amp=tremor_amp,
            sigma=tremor_sigma,
        )

    # Array names in the archive: audio (mel features), cond, target.
    np.savez_compressed(
        out_path,
        audio=np.concatenate(audio_feats, axis=0),
        cond=np.concatenate(conds, axis=0),
        target=target,
    )
    return True


async def main():
    """Command-line entry point for training-data generation.

    Steps, in order:
      1. Parse CLI arguments (``--no-tremor`` forces ``tremor_amp`` to 0).
      2. Load expression presets from ``--presets_json``, or fall back to
         the synthetic bootstrap when that file does not exist.
      3. Read scenarios from the ``--scenarios`` JSONL file, filter them by
         scenario_id prefix, optionally expand dialogues per turn, and
         apply ``--limit``.
      4. Look up pre-generated audio for every scenario (no TTS is run
         here) and abort if nothing is found.
      5. Load LAM and, for variants A/B, the V2 teacher helpers.
      6. Call ``process_scenario`` for every scenario whose output
         ``<scenario_id>.npz`` does not already exist in ``--output``.

    The function is declared ``async`` but contains no ``await``
    expressions; it is driven by ``asyncio.run`` from the script guard.

    Raises:
        SystemExit: if ``--audio_dir`` does not exist, or if no audio is
            found for any turn of any selected scenario.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scenarios", type=Path, default=SCENARIOS)
    ap.add_argument("--output", type=Path, default=OUTPUT_DIR)
    ap.add_argument("--audio_dir", type=Path, default=AUDIO_DIR,
                    help="Directory with pre-generated audio (read-only lookup)")
    ap.add_argument("--limit", type=int, default=0, help="0 = all")
    ap.add_argument("--presets_json", type=Path,
                    default=PROJECT_ROOT / "expression_presets.json",
                    help="User-authored preset JSON. Defaults to "
                         "<project_root>/expression_presets.json (the same "
                         "file abc_experiment.py uses). Falls back to "
                         "synthetic bootstrap if the file doesn't exist.")
    ap.add_argument("--device", default=None)
    ap.add_argument("--filter-prefix", default="long_,solo_",
                    help="Comma-separated scenario_id prefixes to keep. "
                         "Default 'long_,solo_' (monologues + single-turn). "
                         "Use 'all' or '' to disable filtering and process every scenario. "
                         "When --split-dialogues is set, 'daily_' is auto-added if missing.")
    ap.add_argument("--split-dialogues", action="store_true",
                    help="Expand each daily_* dialogue into one .npz per turn "
                         "(short monologue per turn). Audio lookup is rerouted "
                         "to the original (sid, turn_idx). Dialogue context "
                         "learning is left to MicroAlbert; this pipeline only "
                         "consumes (audio, VAD) → face at the render stage.")
    # Pipeline parameters — defaults match the canonical _d65x20 viewer renders.
    ap.add_argument("--persistence-damping", type=float, default=0.65,
                    help="Pose-level scale for fleeting (multi-turn isolated) "
                         "emotions. Single-turn scenarios bypass. Default 0.65.")
    ap.add_argument("--cross-emotion-weight", type=float, default=0.2,
                    help="Weight for cross-emotion VAD-distance blend "
                         "(0=pure within-emotion, 1=pure cross). Default 0.2.")
    ap.add_argument("--cross-emotion-sigma", type=float, default=0.4,
                    help="Gaussian σ for cross-emotion VAD-distance kernel. Default 0.4.")
    ap.add_argument("--vad-damp-gamma", type=float, default=0.3,
                    help="Causal VAD damping γ. Default 0.3.")
    ap.add_argument("--vad-damp-beta", type=float, default=0.7,
                    help="Causal VAD damping β. Default 0.7.")
    ap.add_argument("--vad-smooth-sigma", type=float, default=30.0,
                    help="Cross-turn VAD trajectory Gaussian σ in frames. Default 30.")
    ap.add_argument("--fade-frames", type=int, default=96,
                    help="Crossfade duration at turn boundaries (monologue). Default 96.")
    ap.add_argument("--blink-interval", type=float, default=3.5,
                    help="Mean seconds between blinks (Poisson). Default 3.5.")
    ap.add_argument("--option-e-intensity", type=float, default=1.0,
                    help="α scalar for Option E parametric mouth/cheek overlay. Default 1.0.")
    ap.add_argument("--tremor-amp", type=float, default=0.014,
                    help="Brow + eyeSquint tremor amplitude. Default 0.014 "
                         "(matches viewer runtime tremor). Set 0 to disable.")
    ap.add_argument("--tremor-sigma", type=float, default=1.5,
                    help="Gaussian σ for tremor noise smoothing (frames). "
                         "Default 1.5 → ~2.2 Hz dominant.")
    ap.add_argument("--no-tremor", action="store_true",
                    help="Disable tremor baking entirely (clean targets).")
    ap.add_argument("--lipsync-smooth", action="store_true",
                    help="Apply V2-style jitter-gate EMA to LIPSYNC_ONLY + "
                         "jawOpen channels of the teacher target before saving. "
                         "Removes sub-threshold mouth noise V3 would otherwise "
                         "learn. Mirrors animasync-face-v2/pipeline_v2/smooth_v2.py.")
    ap.add_argument("--lipsync-smooth-alpha", type=float, default=0.6,
                    help="EMA alpha for above-threshold frame deltas. Default 0.6.")
    ap.add_argument("--lipsync-smooth-jitter-alpha", type=float, default=0.1,
                    help="EMA alpha for sub-threshold (jitter) frame deltas. "
                         "Lower = more smoothing. Default 0.1.")
    ap.add_argument("--lipsync-smooth-threshold", type=float, default=0.03,
                    help="Frame-delta cutoff: |Δ|>threshold → real motion, "
                         "|Δ|<=threshold → jitter. Default 0.03.")
    ap.add_argument("--variant", choices=["A", "B", "C"], default="B",
                    help="V2-dynamics teacher variant. "
                         "C = compiler only (no V2). "
                         "A = strict V2 mask (brows + cheek/nose squint + eyeSquint). "
                         "B = tiered V2 mask: A's channels PLUS mouth smile/frown, "
                         "eye wide, mouth dimple at α=0.25. "
                         "Default B — V3 learns to reproduce V2's prosody-driven "
                         "motion from (audio, VAD) alone. V2 ONNX is loaded only "
                         "for A/B (data generation only; not used at V3 inference).")
    args = ap.parse_args()
    if args.no_tremor:
        # --no-tremor is shorthand for --tremor-amp 0.
        args.tremor_amp = 0.0

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
    args.output.mkdir(parents=True, exist_ok=True)

    # Safety: audio_dir must exist and be read-only for this run.
    if not args.audio_dir.exists():
        raise SystemExit(f"Audio dir not found: {args.audio_dir}")
    LOG.info(f"Audio source (read-only): {args.audio_dir}")

    # Presets
    if args.presets_json and args.presets_json.exists():
        LOG.info(f"Loading user presets from {args.presets_json}")
        presets = load_presets_from_json(args.presets_json)
    else:
        LOG.info("Using synthetic preset bootstrap (parametric layer on anchors)")
        presets = build_synthetic_presets()
    LOG.info(f"  → {len(presets)} presets")

    # Load scenarios (JSON Lines: one JSON object per line).
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding, and skip blank / whitespace-only lines
    # instead of letting json.loads raise on them.
    with args.scenarios.open(encoding="utf-8") as f:
        scenarios = [json.loads(line) for line in f if line.strip()]
    LOG.info(f"Loaded {len(scenarios)} scenarios (pre-filter)")

    # Filter by scenario_id prefix (default keeps long_* + solo_*, drops daily_*).
    prefix_str = args.filter_prefix.strip().lower()
    if prefix_str and prefix_str not in ("all", "*"):
        prefixes = tuple(p.strip() for p in prefix_str.split(",") if p.strip())
        if args.split_dialogues and not any(p.startswith("daily") for p in prefixes):
            prefixes = prefixes + ("daily_",)
            LOG.info("--split-dialogues set: auto-added 'daily_' to filter prefix")
        before = len(scenarios)
        scenarios = [s for s in scenarios
                     if s.get("scenario_id", "").startswith(prefixes)]
        LOG.info(f"Filter prefixes={prefixes}: {before} → {len(scenarios)} scenarios")

    # Expand daily_* dialogues into per-turn short-monologue pseudo-scenarios.
    # Long_/solo_ pass through unchanged. dataset_to_viewer.py resolves the
    # split scenario IDs on the fly via regex on the `_t<idx>` suffix, so we
    # no longer write a sidecar JSONL here (parallel runs racing on the same
    # `seed_split_dialogues.jsonl` corrupted it).
    if args.split_dialogues:
        before = len(scenarios)
        scenarios = expand_split_dialogues(scenarios)
        n_split = sum(1 for s in scenarios if s.get("_source_scenario_id"))
        LOG.info(f"--split-dialogues: {before} → {len(scenarios)} scenarios "
                 f"({n_split} per-turn splits from dialogues)")

    # Only a positive limit truncates. A negative value used to fall through
    # to `scenarios[:negative]`, silently dropping scenarios from the END of
    # the list; it is now treated like 0 (= all).
    if args.limit > 0:
        scenarios = scenarios[: args.limit]
        LOG.info(f"--limit applied: {len(scenarios)} scenarios")

    # Look up pre-generated audio for every scenario — NO TTS, read-only.
    scenario_audio_paths = []
    missing_total = 0
    for scen in scenarios:
        paths = lookup_audio_for_scenario(scen, args.audio_dir)
        scenario_audio_paths.append(paths)
        # A turn counts as "missing" only if it has non-empty text but no
        # audio path; turns without text are not expected to have audio.
        missing_total += sum(
            1 for ti, p in enumerate(paths)
            if p is None and scen["turns"][ti].get("text", "").strip()
        )
    total_turns = sum(
        sum(1 for t in s["turns"] if t.get("text", "").strip())
        for s in scenarios
    )
    found_turns = total_turns - missing_total
    LOG.info(f"Audio lookup: {found_turns}/{total_turns} turns have audio "
             f"({missing_total} missing — those turns will be skipped)")
    if found_turns == 0:
        raise SystemExit("No audio found for any scenario. Check --audio_dir.")

    # Load LAM
    LOG.info("Loading LAM model...")
    lam = LAMWrapper(device=args.device)

    # Load V2 teacher (only for variant A/B — data generation only, never
    # invoked at V3 inference time).
    v2_helpers = None
    if args.variant != "C":
        v2_helpers = _load_v2_helpers(args.variant)
        LOG.info(f"V2 teacher ENABLED — variant={args.variant}")
    else:
        LOG.info("V2 teacher DISABLED (variant=C, compiler-only targets)")

    # Process scenarios sequentially (LAM GPU-bound)
    from tqdm import tqdm
    success = 0
    for si, scen in enumerate(tqdm(scenarios, desc="Process scenarios")):
        out_path = args.output / f"{scen['scenario_id']}.npz"
        # Resume support: an already-existing output is counted as a
        # success and not regenerated.
        if out_path.exists():
            success += 1
            continue
        ok = process_scenario(
            scen, scenario_audio_paths[si], lam, presets, out_path,
            persistence_damping=args.persistence_damping,
            cross_emotion_weight=args.cross_emotion_weight,
            cross_emotion_sigma=args.cross_emotion_sigma,
            vad_damp_gamma=args.vad_damp_gamma,
            vad_damp_beta=args.vad_damp_beta,
            vad_smooth_sigma=args.vad_smooth_sigma,
            fade_frames=args.fade_frames,
            blink_interval_s=args.blink_interval,
            option_e_intensity=args.option_e_intensity,
            tremor_amp=args.tremor_amp,
            tremor_sigma=args.tremor_sigma,
            v2_helpers=v2_helpers,
            lipsync_smooth=args.lipsync_smooth,
            lipsync_smooth_alpha=args.lipsync_smooth_alpha,
            lipsync_smooth_jitter_alpha=args.lipsync_smooth_jitter_alpha,
            lipsync_smooth_threshold=args.lipsync_smooth_threshold,
        )
        if ok:
            success += 1

    LOG.info(f"Done. {success}/{len(scenarios)} scenarios successfully processed.")
    LOG.info(f"Output: {args.output}")

# Script entry point. main() is declared with `async def`, so calling it
# only creates a coroutine; asyncio.run() is what actually executes it.
if __name__ == "__main__":
    asyncio.run(main())
