#!/usr/bin/env python3
"""Diagnose why the anger turn shows less brow movement than the sadness turn.

Compares:
  - compiler baseline vs V2 baseline (per channel)
  - headroom for additive dynamics (1.0 - comp_bs)
  - V2 motion amplitude per channel
  - resulting motion after clipping
"""
import sys, json
from pathlib import Path
import numpy as np
import librosa
import onnxruntime as ort

sys.path.insert(0, '/dataset/text-to-face-se/LAM_Audio2Expression')
from distillation.student_model import AudioFeatureExtractor

from scripts.compiler.blend import compile_batch
from scripts.compiler.constants import ARKIT_52_NAMES
from scripts.compiler.utils import load_presets_from_json
from scripts.compiler.abc_experiment import (
    run_v2, highpass_per_channel, SUB_TO_BASE, V2_TO_ANIMA,
)

# Checkout that holds the preset file and the preview audio clips.
PROJECT_ROOT = Path('/dataset/AnimaSync-mic-fix')
PRESETS = PROJECT_ROOT.joinpath('expression_presets.json')

# One entry per turn to diagnose: (audio file name, emotion label, VAD triple).
# The audio file name is resolved under PROJECT_ROOT/data/audio_preview.
CASES = [
    # sad-like VAD
    ('daily_003_t0_struggle.mp3', 'struggle', [-0.6, -0.3, -0.4]),
    # angry VAD
    ('daily_002_t2_anger.mp3', 'anger', [-0.6, 0.7, 0.5]),
]

# Channels shown in the report: the brow channels plus the neighbouring
# cheek-squint, nose-sneer and eye-squint channels.
BROW_CHANNELS = [
    # brows
    'browInnerUp',
    'browDownLeft',
    'browDownRight',
    'browOuterUpLeft',
    'browOuterUpRight',
    # cheeks
    'cheekSquintLeft',
    'cheekSquintRight',
    # nose
    'noseSneerLeft',
    'noseSneerRight',
    # eyes
    'eyeSquintLeft',
    'eyeSquintRight',
]

# Expression presets handed to compile_batch for every case.
presets = load_presets_from_json(PRESETS)

# ONNX file for the V2 model; the session is restricted to the CPU provider.
_V2_MODEL_PATH = (
    '/dataset/mead-expression-training/e2f/distill/emotion_face_v8_brow09.onnx'
)
v2_sess = ort.InferenceSession(
    _V2_MODEL_PATH,
    providers=['CPUExecutionProvider'],
)

# Audio feature extractor passed to run_v2 together with the session.
v2_feat = AudioFeatureExtractor()

# ---------------------------------------------------------------------------
# Report: one table section per entry in CASES, one row per BROW_CHANNELS name.
#
# Columns:
#   comp_bs        mean of the compiler output for the channel
#   V2 baseline    mean of the raw V2 output for the channel
#   V2 motion std  std of the high-passed V2 output
#   headroom       1.0 - comp_bs mean (distance to the upper clip limit)
#   final motion   std of clip(comp_bs + ALPHA * high-passed V2, 0, 1)
# ---------------------------------------------------------------------------

# Rows requested from the compiler per second of audio.
FRAMES_PER_SECOND = 30
# sigma_frames argument forwarded to highpass_per_channel.
HIGHPASS_SIGMA_FRAMES = 15.0
# Gain applied to the high-passed V2 signal before it is added to comp_bs.
ALPHA = 0.5

# Resolve channel names to column indices once. An unknown name raises
# ValueError here, before any audio is loaded or any model is run.
channel_indices = [(name, ARKIT_52_NAMES.index(name)) for name in BROW_CHANNELS]

# The header carries the same two-space indent as the data rows so that the
# column titles and the "│" separators line up with the values below them.
print(
    f"  {'channel':20s} │ {'comp_bs':>10s} │ {'V2 baseline':>12s} │ "
    f"{'V2 motion std':>13s} │ {'headroom':>10s} │ {'final motion':>13s}"
)
print("─" * 100)

for audio_fn, emo, vad in CASES:
    wav_path = PROJECT_ROOT / 'data' / 'audio_preview' / audio_fn
    wav, sr = librosa.load(str(wav_path), sr=16000, mono=True)
    T = int(len(wav) / sr * FRAMES_PER_SECOND)

    # Compiler output: the same emotion label and VAD triple repeated for
    # every frame, so the result is static over the turn.
    comp_bs = compile_batch(
        emotions=[emo] * T,
        vads=np.tile(np.asarray(vad, dtype=np.float32), (T, 1)),
        presets=presets,
        apply_lipsync_mask=False,
    )

    # V2 output in AnimaSync order, plus its high-passed (motion-only) version.
    v2_bs = run_v2(v2_sess, v2_feat, wav, emo)
    v2_dyn = highpass_per_channel(v2_bs, sigma_frames=HIGHPASS_SIGMA_FRAMES)

    # Align: the two sources may disagree on frame count, keep the overlap.
    T_use = min(comp_bs.shape[0], v2_bs.shape[0])
    comp_bs = comp_bs[:T_use]
    v2_bs = v2_bs[:T_use]
    v2_dyn = v2_dyn[:T_use]

    print(f"\n═══ {emo.upper()}  ({audio_fn}) ═══")

    if T_use == 0:
        # mean()/std() over zero frames would only print NaN rows and
        # NumPy runtime warnings.
        print("  (no frames to compare - skipping)")
        continue

    for name, ch in channel_indices:
        comp_mean = comp_bs[:, ch].mean()
        v2_mean = v2_bs[:, ch].mean()
        v2_std = v2_dyn[:, ch].std()
        # Upper headroom only. NOTE(review): downward motion is limited by
        # the 0.0 floor of the clip below, which this column does not show.
        headroom = 1.0 - comp_mean

        # Actual applied motion after clipping to the [0, 1] range.
        final = np.clip(comp_bs[:, ch] + ALPHA * v2_dyn[:, ch], 0.0, 1.0)
        final_motion = final.std()

        print(
            f"  {name:20s} │ {comp_mean:>10.3f} │ {v2_mean:>12.3f} │ "
            f"{v2_std:>13.4f} │ {headroom:>10.3f} │ {final_motion:>13.4f}"
        )
