"""V3 face model configuration.

Hyperparameters chosen to land at ~3M params (~12 MB fp32, ~3 MB int8) with
~1 ms/frame CPU inference. See architecture sketch in models/v3_face/model.py.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Tuple


@dataclass
class V3FaceConfig:
    # ─── Input / output dims ─────────────────────────────────────────
    audio_dim: int = 80           # log-mel features per frame (from .npz)
    cond_dim: int = 19            # 16 emotion one-hot + 3 VAD (per-frame)
    output_dim: int = 52          # ARKit blendshape channels

    # ─── Architecture (split-branch causal TCN with FiLM conditioning) ───
    # Shared backbone learns audio features useful for both lipsync and
    # expression. Two branches diverge after the shared backbone, each with
    # its own dilated TCN stack + Linear head. This lets phase-2 retraining
    # freeze the lipsync side (shared backbone + lipsync branch + head)
    # while expression branch + head continue to train — true lipsync lock.
    hidden_dim: int = 192
    kernel_size: int = 5
    # Shared backbone — early local-to-mid range temporal context
    shared_dilations: Tuple[int, ...] = (1, 2, 4, 8, 16, 32)
    # Per-branch — long-range refinement, run independently per output group
    branch_dilations: Tuple[int, ...] = (64, 128)
    dropout: float = 0.1

    # ─── Training ────────────────────────────────────────────────────
    fps: int = 30
    crop_frames: int = 240        # 8 seconds per training sample
    batch_size: int = 32
    learning_rate: float = 1e-3
    weight_decay: float = 0.01
    warmup_steps: int = 500
    n_epochs: int = 50
    grad_clip: float = 1.0

    # Long_ scenarios are ~6% of file count but the only source of multi-
    # emotion transitions. Weighted sampling boosts their gradient share.
    long_oversample_weight: float = 5.0

    # ─── Target-side gain (stronger expression / lipsync) ────────────
    # Multiplies the .npz target on selected channels at load time, then
    # clamps to [0, 1]. The model learns to produce stronger output. No
    # data regeneration required — pure dataloader knob. Default 1.0 = off,
    # preserves prior training behavior bit-exactly.
    #   lipsync_target_gain  → PURE_LIPSYNC channels: jaw, cheekPuff,
    #     mouthClose/Funnel/Left/LowerDown/Press/Pucker/Right/Roll/Shrug/
    #     Stretch/UpperUp, tongueOut. (Pure mouth/jaw mechanics.)
    #   expression_target_gain → EMOTIONAL channels: brows incl. innerUp,
    #     cheekSquint, eyeSquint, eyeWide, mouthDimple, mouthFrown,
    #     mouthSmile, noseSneer. (Channels carrying emotional valence.)
    # Skipped on purpose: eyeBlink (already correct natural magnitude),
    # eyeLook (gaze direction, not emotion).
    lipsync_target_gain: float = 1.0
    expression_target_gain: float = 1.0
    # Optional split: if set to a different value, mouth (smile/frown/dimple/
    # sneer) gets THIS gain while the rest of expression gets the value
    # above. None → same gain (backward compat with v14/v18). v18b uses
    # 1.4 here + 1.8 above to keep strong brows without breaking lipsync
    # under crisp_mouth's per-channel normalization.
    emotional_mouth_target_gain: float | None = None

    # ─── Loss weighting ──────────────────────────────────────────────
    # Per-channel L1 weights. Lipsync channels matter most (audio-sync is
    # the most visually obvious failure mode). Eye-blink channels get low
    # weight because blinks are sparse / sharp and hard to learn from audio.
    lipsync_weight: float = 2.0
    expression_weight: float = 1.0
    eye_blink_weight: float = 0.3   # ch 8, 9 (eyeBlinkLeft / eyeBlinkRight)

    # Smoothness penalty — L1 on per-frame velocity, weighted per channel.
    # Lipsync (mouth/jaw) needs HEAVY smoothing — those channels show jitter
    # most visibly. Brows want LIGHT smoothing — V2 prosody motion lives
    # there and over-smoothing would kill the audio-driven brow life.
    # Eye blink channels intentionally have NEAR-ZERO velocity penalty so
    # blinks remain sharp 5-frame transients (not smoothed away).
    # Per-channel velocity weights. Two prior attempts collapsed:
    #   - 0.8/0.6/0.5/0.05 (avg 0.63, 2.1× baseline) — stuck for full run
    #   - 0.5/0.4/0.35/0.05 (avg 0.41, 1.37× baseline) — partial recovery only
    # The model can't escape the near-constant local minimum when total
    # velocity pressure exceeds ~1.2× the previously-working uniform 0.3.
    # These values average to 0.30 — essentially baseline — with a very
    # modest lipsync bump (1.17×) and blinks preserved.
    # Velocity warmup epochs: velocity loss starts at 0, ramps linearly to
    # full strength over this many epochs. Lets the model learn to fit L1
    # (varied output) BEFORE velocity penalty kicks in, avoiding the
    # near-constant-output local minimum that trapped prior attempts.
    velocity_warmup_epochs: int = 20

    # With warmup, we can use stronger velocity weights without collapse.
    velocity_lipsync_weight: float = 0.7      # mouth/jaw — 2.3× baseline
    velocity_shared_weight: float = 0.55      # jawOpen, mouth corners
    velocity_expression_weight: float = 0.45  # brows, cheek — 1.5× baseline
    velocity_eye_blink_weight: float = 0.05   # blinks — minimal so they stay sharp
    # eyeSquint (ch 18, 19) special case. Real orbicularis oculi is slow
    # and sustained; at high expression gain the model picks up frame-to-
    # frame noise in the targets and looks "jittery". Higher velocity weight
    # specifically on these two channels suppresses that without affecting
    # blinks or other expression channels. None → fall back to
    # velocity_expression_weight (backward compat with v14/v18).
    velocity_eye_squint_weight: float | None = None
    # brows (ch 0–4) special case. Same story as eyeSquint — at high gain,
    # the model picks up frame-to-frame target noise and the brows visibly
    # flicker. Bumping the per-channel velocity weight on the five brow
    # channels suppresses the jitter. None → fall back to
    # velocity_expression_weight (backward compat with v14/v18/v18b).
    velocity_brow_weight: float | None = None
    # (legacy single-knob kept for backward compat in older configs)
    velocity_weight: float = 0.3

    # Plosive damper baked into the dataset target. Mirrors the runtime
    # "plosive damper" slider in the viewer: when mouthClose > 0.4 on a
    # given frame, mouthPress/Roll/Shrug (ch 35,36,39,40,41,42) get
    # multiplied by (1 - plosive_damp_target * smoothstep(mouthClose)),
    # killing the "lips swallowed" stack on m/b/p plosives. 0 = no damping
    # (default, backward compat). Applied AFTER per-channel gain in the
    # dataset, BEFORE the [0,1] clamp.
    plosive_damp_target: float = 0.0

    # ─── Target pre-smoothing (per-channel Gaussian, applied BEFORE gain) ─
    # The "velocity_*" knobs above are velocity-MATCHING losses
    # (|pred_v - target_v|), not output-smoothness penalties — so they
    # propagate any target jitter straight into model output. To genuinely
    # smooth a channel family we have to smooth the TARGETS so the model
    # sees clean trajectories AND its velocity-matching loss matches a
    # clean velocity profile. Sigma is in frames at 30 fps; σ=2 ≈ 67 ms,
    # σ=3 ≈ 100 ms. 0 = no smoothing (default, backward compat).
    smooth_target_sigma_brow: float = 0.0        # ch 0–4
    smooth_target_sigma_eye_squint: float = 0.0  # ch 18, 19
    # eyeWide (ch 20, 21) — paired with eyeSquint at turn boundaries
    # (surprise → other). Without target smoothing the anti-correlated
    # ramp-down-while-other-ramps-up looks discontinuous. Same σ scale
    # as eyeSquint typically.
    smooth_target_sigma_eye_wide: float = 0.0    # ch 20, 21

    # ─── Per-emotion gain override for browInnerUp (ch 2) ────────────────
    # browInnerUp is the "concerned/distress" inner-brow raise. At high
    # expression_target_gain it gets pushed up even on positive emotions,
    # making joy look sad/apologetic. This knob lets browInnerUp use a
    # DIFFERENT gain on happy-family emotions (joy / laughter / excitement
    # / gratitude) than on the rest. Typical: 1.0 (no gain on happy) while
    # expression_target_gain stays high for everything else.
    # None → no override (browInnerUp follows expression_target_gain
    # everywhere, backward compat with v14..v18f).
    brow_innerup_happy_gain: float | None = None

    # ─── Per-emotion gain override for ALL brow channels on happy frames ─
    # Same pattern as `brow_innerup_happy_gain` but covers ch 0–4 (the full
    # brow group), not just browInnerUp. Lets brows max out on sad/angry/
    # surprise frames while staying at a v14-style calm gain (~1.4) on
    # joy / laughter / excitement / gratitude. Use this when you also want
    # `brow_target_gain` at a hard-max value (e.g. 10) for non-happy
    # emotions but need to hold brows back on happy frames so smiles don't
    # look concerned/apologetic. None → no override (backward compat).
    brow_happy_gain: float | None = None

    # ─── Per-channel-group override for ALL brow channels (ch 0–4) ───────
    # Splits brows out of `expression_target_gain`. When the rest of the
    # expression channels (cheekSquint, eyeSquint, eyeWide) want to be
    # baked stronger (e.g. ×6 to capture a "viewer-slider-at-3.5×" look
    # without runtime amplification), brows would otherwise come along and
    # make every joy frame look concerned. This knob lets brows ride at a
    # v14-style mild gain (e.g. 1.4) while the rest of expression goes
    # much higher. None → brows follow expression_target_gain (backward
    # compat with v14..v18g).
    brow_target_gain: float | None = None

    # ─── Per-frame brow scale-down on high-eyeWide (surprise/fluster) ────
    # At very high `brow_target_gain`, surprise/fluster frames push the brows
    # so far up the lift itself looks worse than the surprise. This knob
    # mirrors the viewer's "brow cap" slider exactly: uses the per-frame
    # POST-gain eyeWide value as a surprise proxy and scales brow channels
    # (0–4) down by `1 - wide_ramp * brow_surprise_gain`, where
    # `wide_ramp = clip((max(eyeWide_L, eyeWide_R) - 0.10) / 0.20, 0, 1)`.
    # Equivalent to baking the viewer's `browCapStrength` value into the
    # training target. None → no scale-down (backward compat). Typical:
    # 0.3–0.5.
    brow_surprise_gain: float | None = None

    # ─── Hard cap on eyeWide channels (ch 20, 21) after gain ─────────────
    # At very high expression_target_gain, eyeWide saturates to 1.0 on
    # any moderately surprised/excited frame, giving the avatar a perma-
    # bug-eyed look. This cap clips ch 20/21 to a max value AFTER gain is
    # applied but BEFORE the final [0,1] clamp. Typical: 0.5–0.7. None →
    # no cap (backward compat).
    eye_wide_max: float | None = None

    # ─── Soft-clip the post-gain target instead of hard clamp [0,1] ──────
    # At very high gains, the dataset's hard clamp turns smooth emotional
    # cross-fades (e.g. neutral → joy ramp over ~15 frames) into instant
    # snap-to-1.0 transitions — the tail of the fade-in gets chopped off
    # because the post-gain value already exceeds 1 by frame ~5. Soft clip
    # replaces `min(x, 1)` with a smooth knee:
    #     y = x                                            if x ≤ knee
    #     y = 1 - (1-knee) * exp(-(x-knee)/(1-knee))       if x > knee
    # Below `knee` the curve is perfectly linear (no distortion of small/
    # mid amplitudes). Above `knee` it asymptotes toward 1 gracefully so
    # the ramp keeps moving even when source × gain is way past 1. Lower
    # bound stays hard-clipped at 0. False → hard clamp (backward compat).
    soft_clip: bool = False
    soft_clip_knee: float = 0.7

    # ─── Smooth the emotion one-hot in the conditioning vector ───────────
    # `cond[:, :16]` is a per-frame emotion one-hot. In multi-turn
    # scenarios (long_*) the one-hot steps instantly from one emotion to
    # the next at turn boundaries — the model sees joy=1.0 frame N then
    # sadness=1.0 frame N+1. Gaussian-smoothing the one-hot along the
    # time axis turns that step into a blended ramp (joy=0.7 + sadness=0.3
    # → joy=0.3 + sadness=0.7 over the transition window). The model
    # learns to produce smooth cross-fade output across boundaries. σ in
    # frames at 30 fps; σ=5 ≈ 167 ms, σ=10 ≈ 333 ms. 0 → no smoothing
    # (backward compat).
    # MUST also be applied at inference time (see infer.py --smooth-cond
    # -sigma-emotion) or the model gets a different cond distribution.
    smooth_cond_sigma_emotion: float = 0.0
