"""Export seed_train_final / seed_val / seed_test jsonl files into one Excel sheet.

Layout (1 row per scenario):
  # | scenario | emotion + VAD (per turn) | content (per turn, with speaker)
"""
from __future__ import annotations

import json
from pathlib import Path

from openpyxl import Workbook
from openpyxl.styles import Alignment, Font, PatternFill
from openpyxl.utils import get_column_letter

DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "emotion"
AUDIO_DIR = Path(__file__).resolve().parent.parent / "data" / "audio_preview"
AUDIO_REL = "data/audio_preview"  # path to write into the sheet (relative to repo root)
OUT_PATH = DATA_DIR / "dataset_export.xlsx"

SOURCES = [
    ("train", "seed_train_final.jsonl"),
    ("val",   "seed_val.jsonl"),
    ("test",  "seed_test.jsonl"),
]

# 10 solo (single-utterance) examples to pin at the very beginning of the sheet.
# Picked to span the V/A/D space so designers/body-motion actors see the full
# expression range up front before scrolling into multiturn dialogues.
PINNED_SOLO_IDS = [
    "solo_007",   # neutral
    "solo_027",   # joy
    "solo_011",   # sadness
    "solo_037",   # anger
    "solo_073",   # surprise
    "solo_045",   # excitement
    "solo_116",   # crying
    "solo_052",   # laughter
    "solo_020",   # shy
    "solo_069",   # fluster
]


def fmt_vad(vad):
    if not vad:
        return ""
    v, a, d = vad
    return f"V={v:+.2f} A={a:+.2f} D={d:+.2f}"


def build_emotion_vad_block(turns):
    """One line per turn: '1. emotion  V=.. A=.. D=..'"""
    lines = []
    for i, t in enumerate(turns, start=1):
        emo = t.get("emotion", "?")
        vad = fmt_vad(t.get("vad"))
        lines.append(f"{i}. {emo}   {vad}".rstrip())
    return "\n".join(lines)


def build_mp3_block(scenario_id, turns):
    """One line per turn: relative mp3 path. Marks missing files with (missing)."""
    lines = []
    for i, t in enumerate(turns):
        emo = t.get("emotion", "")
        fname = f"{scenario_id}_t{i}_{emo}.mp3"
        full = AUDIO_DIR / fname
        marker = "" if full.exists() else "  (missing)"
        lines.append(f"{AUDIO_REL}/{fname}{marker}")
    return "\n".join(lines)


def build_content_block(turns):
    """One line per turn: 'A: text   [emotion | V/A/D]'"""
    lines = []
    for t in turns:
        spk = t.get("speaker", "?")
        text = (t.get("text") or "").strip()
        emo = t.get("emotion", "")
        vad = fmt_vad(t.get("vad"))
        meta = f"[{emo} | {vad}]" if vad else f"[{emo}]"
        lines.append(f"{spk}: {text}   {meta}")
    return "\n".join(lines)


def load_records():
    records = []
    for split, name in SOURCES:
        p = DATA_DIR / name
        if not p.exists():
            print(f"[skip] {p} (not found)")
            continue
        with p.open(encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                records.append((split, obj))
        print(f"[load] {name}: {sum(1 for s, _ in records if s == split)} scenarios")

    # Pull the pinned solo examples to the top, in PINNED_SOLO_IDS order
    pinned, rest = [], []
    pin_set = set(PINNED_SOLO_IDS)
    seen = {}
    for split, obj in records:
        sid = obj.get("scenario_id", "")
        if sid in pin_set and sid not in seen:
            seen[sid] = (split, obj)
        else:
            rest.append((split, obj))
    for sid in PINNED_SOLO_IDS:
        if sid in seen:
            pinned.append(seen[sid])
        else:
            print(f"[warn] pinned solo not found: {sid}")
    print(f"[pin] {len(pinned)} solo examples pinned to top")
    return pinned + rest


def main():
    records = load_records()

    wb = Workbook()
    ws = wb.active
    ws.title = "AnimaSync Dataset"

    headers = ["#", "scenario", "내용 (speaker · text · emotion · VAD)", "mp3 path (per turn)"]
    ws.append(headers)

    # Header style
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="305496")
    for col_idx, _ in enumerate(headers, start=1):
        c = ws.cell(row=1, column=col_idx)
        c.font = header_font
        c.fill = header_fill
        c.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)

    for i, (split, obj) in enumerate(records, start=1):
        scen_id = obj.get("scenario_id", "")
        setting = obj.get("setting", "")
        style = obj.get("style", "")
        scen_label = scen_id
        if setting:
            scen_label += f"\n({setting})"
        if style:
            scen_label += f"\n[{style}]"
        turns = obj.get("turns", [])

        row = [i, scen_label, build_content_block(turns), build_mp3_block(scen_id, turns)]
        ws.append(row)

        for col_idx in range(1, len(row) + 1):
            cell = ws.cell(row=i + 1, column=col_idx)
            cell.alignment = Alignment(
                horizontal="left" if col_idx >= 2 else "center",
                vertical="top",
                wrap_text=True,
            )

        # Force row height so all turn lines stay visible in Excel
        n_lines = max(len(turns), 3)  # scenario_label has up to 3 lines
        ws.row_dimensions[i + 1].height = max(20, n_lines * 16 + 6)

    # Column widths
    widths = {1: 6, 2: 28, 3: 90, 4: 55}
    for col_idx, w in widths.items():
        ws.column_dimensions[get_column_letter(col_idx)].width = w

    # Freeze header row
    ws.freeze_panes = "A2"

    # Auto filter on header row
    ws.auto_filter.ref = ws.dimensions

    wb.save(OUT_PATH)
    print(f"\n[done] {OUT_PATH}  ({len(records)} scenarios)")


if __name__ == "__main__":
    main()
