o
    js?                     @  s  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlZddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZm Z m!Z!m"Z" ee#$ j%d Z&e&d d d Z'e&d d d Z(e&d d Z)e&d d Z*e&d d Z+e&d d d d Z,e-dZ.dAdd Z/dBd&d'Z0e1 dCdDd+d,Z2	-dEdFd5d6Z3dGd7d8Z4dHd<d=Z5d>d? Z6e7d@kre6  dS dS )IuY  V3 end-to-end pipeline test:

    text  →  KlueTeacher  →  (emotion argmax, VAD)  →  cond
    audio →  mel
                                            ↓
                          V3 face model + locked post-processing
                                            ↓
                            (T, 52) ARKit blendshapes
                                            ↓
                         <sid>_e2e_dataset.json (viewer)

Writes alongside the existing `_dataset` (teacher GT) and `_pred_dataset`
(V3 with GT cond) entries so the viewer dropdown shows all three for
each scenario — lets us A/B which step degrades.

Usage:
    PYTHONPATH=. python3 -m models.v3_face.infer_e2e         --ckpt models/v3_face/checkpoints/best_expression.pt         -s long_001 long_100 long_046

    PYTHONPATH=. python3 -m models.v3_face.infer_e2e --all-viewer
    )annotationsN)Path)DictListOptional)gaussian_filter1d)AutoTokenizer)ARKIT_52_NAMES)EMOTION_LABELSEMOTION_TO_IDXFPSmel_featureslookup_audio_for_scenario)KlueTeacherForEmotionVAD   )crisp_mouthsmooth_browsinject_blinks
load_model   checkpointsklue_teacher_clean_ctx2zbest.pt	tokenizerdataaudio_preview
viewer_e2eemotionmodelsv3_facezbest_expression.ptz^(daily_.+)_t(\d+)$	ckpt_pathr   tokenizer_dirdevicetorch.devicec           
   	   C  sP  t j| |dd}t|tr|di ni }t|dd}|dddgi t|dd|d	d
|ddd}t	||j
jjkrK|j
t	| || }t|tra|d|d|n|}dd | D }|j|dd\}}	|sy|	rtdt	| dt	|	 d|dd  d td| d dd|  dt	| d ||fS )uK  Load the KLUE-RoBERTa emotion+VAD teacher and its tokenizer.

    The teacher was trained with 2 extra special tokens added to the base
    klue/roberta-base vocab (32000 → 32002). We load the tokenizer first to
    determine the actual vocab size, then resize the model's embeddings
    before loading the saved state_dict.
    F)map_locationweights_onlyconfig
model_namezklue/roberta-baseadditional_special_tokensz[SELF]z[OTHER]num_emotions   vad_dim   )r&   r(   r*   modelmodel_state_dictc                 S  s(   i | ]\}}| d d ddd|qS )zmodule. zmodel.r   )replace).0kv r3   P/dataset/kemix-engine/package/face/animasync-face-v3/models/v3_face/infer_e2e.py
<dictcomp>^   s    z load_teacher.<locals>.<dictcomp>)strictz[teacher] state_dict load: z
 missing, z unexpected (first 3 missing: N)z[teacher] loaded g    .A.1fzM params from z (vocab=)torchload
isinstancedictgetr   from_pretrainedadd_special_tokensr   lenbackboner%   
vocab_sizeresize_token_embeddingstoevalitemsload_state_dictprint
num_params)
r   r    r!   ckptcfgr   teacherstatemissing
unexpectedr3   r3   r4   load_teacher>   sD   







rP   sidstremotion_dirreturnOptional[dict]c           	   	   C  s"  dD ]}|| }|  sq| %}|D ]}t|}|d | kr.|  W  d     S qW d   n1 s9w   Y  t| }|r| =}|D ]2}t|}|d |dkr~t|d}| |d|g|d | gd  W  d     S qLW d   n1 sw   Y  qdS )zLocate a scenario across the three split JSONL files. Returns the
    parsed dict; for daily_*_t<i> splits, synthesizes a single-turn scenario
    so audio lookup reroutes via _source_* fields.)zseed_train_final.jsonlzseed_val.jsonlzseed_test.jsonlscenario_idNr   r   turns)rV   _source_scenario_id_source_turn_indicesrW   )existsopenjsonloadsSPLIT_REmatchgroupint)	rQ   rS   fnamepflinerowmtir3   r3   r4   find_scenarioi   s>   




ri      texts	List[str]c                 C  s   ||dd|dd |}| |d |d }|d }tj|dd  }|jdd  }	d	d
 |	D }
|d   }|
||fS )zgRun KlueTeacher on a batch of turn texts. Returns (argmax_emotions,
    full_probs (N,16), vads (N,3)).Tpt)padding
truncation
max_lengthreturn_tensors	input_idsattention_maskemotion_logits)dimc                 S  s   g | ]}t | qS r3   )r
   r0   ir3   r3   r4   
<listcomp>       z#teacher_predict.<locals>.<listcomp>vad)rD   r9   softmaxcpunumpyargmax)rL   r   rk   r!   rp   encoutlogitsprobspred_idxpred_emotions	pred_vadsr3   r3   r4   teacher_predict   s   
r         >@per_turn_emotionsper_turn_vads
np.ndarrayturn_Ts	List[int]vad_smooth_sigmafloatc                 C  s   t |}tj|dftjd}tj|dftjd}d}t| ||D ]'\}}	}
t|d}d||||
 |f< tj|	tjd||||
 < ||
7 }q |dkrjt|dkrjt	||ddd
tj}t	||ddd
tj}tj||gd	d

tjS )u  Build per-frame (T, 19) cond from teacher per-turn predictions.
    Mirrors data_pipeline.py: hard one-hot emotion + 3-D VAD per turn, then
    cross-turn Gaussian smoothing (σ=30 frames) on both. V3 was trained on
    this exact shape of cond signal, so feeding it the same shape (just
    with teacher predictions instead of GT labels) is the apples-to-apples
    pipeline test.
    r)   )dtyper+   r         ?r   nearest)sigmaaxismoderu   r   )sumnpzerosfloat32zipr   r=   asarrayr@   r   astypeconcatenate)r   r   r   r   n_totalall_emosall_vadscursoremor{   Tidxr3   r3   r4   build_cond_smoothed   s*   
r   c           '        s  t | |j}|d u rtd|  d d S t||j}g }	g }
g }tt|d |D ]E\}\}}|dd }|rA|d u sA|	 sBq)t
jt|ddd\}}t|d	k rUq)t||td
}|	| |
|jd  || q)|s{td|  d d S dd |D }t||||\}}}t|||
|jd}tj|	ddtj}t|d|}t|d|}t  |||d   tj}W d    n1 sw   Y  |j!rt"||j#|j$|j%|j&d}|j'rt(||j)|j*|j+d}|j,rt-|| |j.|j/d}g }tt||||D ]B\}\}}}  fddt0 dd  d d d D } ||||dd|1 t2|dg d|dd|dd| d q|  d}!|!dt3|jd t4|t5|d1 d }"|j6|! d! }#|#7t8j9|"d"d# |j6|  d$ }$|j6|! d% }%|%	 s|%: r|%;  |$	 r|%<|$j= ntd&|  d' t>d(d) t||D }&td*|  d+|& d,t| d-|jd   | |!|&t|d.S )/Nu     ✗ z: scenario not foundrW   textr.   i>  T)srmonog      @)r   fpsr   z: no valid turnsc                 S  s   g | ]}|d  qS )r   r3   )r0   tr3   r3   r4   ry      rz   z$predict_and_save.<locals>.<listcomp>)r   r   )	thresholdscalepre_smooth_sigmamouth_close_sigma)
min_cutoffbetad_cutoff)rV   mean_interval_sexpressive_capc                   s"   g | ]}t | t | d qS ))labelprob)r
   r   rw   r   r3   r4   ry      s    ru   r   neutralr{   )r   r   r   speaker)turn_idxr   
gt_emotionr{   gt_vadr   r   top3_emotions_e2e_dataset      )rV   r   
num_framesnamesrW   blendshapesz.jsonF)ensure_asciiz_dataset.mp3z.mp3z  ! z,: teacher mp3 missing; e2e will play silent.c                 s  s(    | ]\}}| d d|krdV  qdS )r   r   r   Nr=   )r0   r   r   r3   r3   r4   	<genexpr>  s   
 z#predict_and_save.<locals>.<genexpr>u     ✓ z  emotion match /z	  frames=)rQ   new_basematchestotal)?ri   rS   rH   r   	audio_dir	enumerater   r=   striprZ   librosar:   rR   r@   r   r   appendshaper   r   r   r   r   r   r   r9   
from_numpy	unsqueezerD   no_gradsqueezer}   r~   crispr   crisp_thresholdcrisp_scalecrisp_sigmacrisp_mouthclose_sigmabrow_smoothr   brow_min_cutoff	brow_betabrow_d_cutoff
add_blinksr   blink_intervalblink_expressive_capargsorttolistlistra   r	   round
viewer_dir
write_textr\   dumps
is_symlinkunlink
symlink_tonamer   )'rQ   r,   rK   rL   r   r!   argsscenaudio_paths	turn_melsr   valid_turnsrh   turnapr   wavr   melrk   	pred_emos
pred_probsr   condaudioaudio_tcond_tpred
turns_metar   r   r{   top3r   viewer_jsonout_jsonteacher_mp3pred_mp3r   r3   r   r4   predict_and_save   s   


"





r  r   predictions
List[dict]c                 C  s   | d }|  rt| ndg i}dd |d D }|D ]*}|d }||d  di }||g |dd	|d
g d|dd d||< qt| |d< |tj|ddd d S )Nzmanifest.json	scenariosc                 S  s   i | ]}| d |qS )baser   )r0   sr3   r3   r4   r5     s    z#update_manifest.<locals>.<dictcomp>r   rQ   _datasetn_turnsr   emotionsz	[V3 e2e] text_previewr.   )r  rV   variantsr
  r  r  Fr   )r   indent)	rZ   r\   r]   	read_textr=   r   valuesr   r   )r   r  manifest_pathmanifestseenrc   r   teacher_entryr3   r3   r4   update_manifest  s$   

r  c               
   C  s,  t  } | jdttd | jdttd | jdttd | jdttd | jdttd | jdtt	d | jdd	d
 | jdddd d | jdddd | jdt
ddd | jdt jdd | jdt
dd | jdt
dd | jdt
dd | jd t
dd | jd!t jdd | jd"t
d#d | jd$t
d%d | jd&t
dd | jd't jdd | jd(t
d)d | jd*t
d+d |  }|js|js| d, |jrg }t|jd-D ]}|j}d.|v sd/|v rq||d td0   qtd1t| d2 n|j}t|j}td3|j  t|j|\}}td4|j  t|j|j|\}}	td5 td6t| d7|j d8 g }
d9}d9}|D ]!}t|||||	||}|r_|
| ||d: 7 }||d; 7 }q?|
rt|j|
 td< td=| d>| d?d@| t |dA dBdC tdD tdE tdF tdG d S )HNz--ckpt)typedefaultz--teacher_ckptz--tokenizer_dirz--audio_dirz--viewer_dirz--emotion_dirz--devicezcuda:0)r  z-sz--scenarios+)nargsr  z--all-viewer
store_truezRun E2E on every scenario currently curated in data/viewer/. Preserves existing _dataset and _pred_dataset entries; only adds _e2e_dataset.)actionhelpz--vad-smooth-sigmar   u;   Cross-turn Gaussian σ. Default 30 (matches data_pipeline).)r  r  r  z--crispT)r  r  z--crisp-thresholdg333333?z--crisp-scaler   z--crisp-sigmag?z--crisp-mouthclose-sigmaz--brow-smoothz--brow-min-cutoffg       @z--brow-betag{Gz?z--brow-d-cutoffz--add-blinksz--blink-intervalg      @z--blink-expressive-capg      ?z#provide --scenarios or --all-viewerz*_dataset.json_pred_datasetr   r	  zall-viewer: z scenarios curatedzLoading V3 from zLoading KlueTeacher from u6   
Pipeline: text → KlueTeacher → V3 → blendshapeszPredicting u    scenarios → 
r   r   r   u&   
── teacher argmax accuracy ──z  matches GT label: r   z  (d   r   r8   z%)u>   
Done. Each scenario now appears up to 3× in viewer dropdown:z   <sid>_dataset       teacher GTzK  <sid>_pred_dataset  V3 with GT cond (argmax of teacher's training labels)z8  <sid>_e2e_dataset   V3 with KlueTeacher-predicted cond)!argparseArgumentParseradd_argumentr   DEFAULT_CKPTDEFAULT_TEACHER_CKPTDEFAULT_TOKENIZER_DIRDEFAULT_AUDIO_DIRDEFAULT_VIEWER_DIRDEFAULT_EMOTION_DIRr   BooleanOptionalAction
parse_argsr  
all_viewererrorsortedr   globstemr   r@   rH   r9   r!   rJ   r   teacher_ckptrP   r    r  r  max)r   r   sidsrc   r/  r!   r,   rK   rL   r   r  total_mtotal_trQ   rr3   r3   r4   main)  s   



r6  __main__)r   r   r    r   r!   r"   )rQ   rR   rS   r   rT   rU   )rj   )rk   rl   )r   )
r   rl   r   r   r   r   r   r   rT   r   )rQ   rR   rT   rU   )r   r   r  r  )8__doc__
__future__r   r   r\   repathlibr   typingr   r   r   r   r~   r   r9   scipy.ndimager   transformersr   scripts.compiler.constantsr	   scripts.compiler.data_pipeliner
   r   r   r   r   models.microalbert.teacherr   inferr   r   r   r   __file__resolveparentsPROJECT_ROOTr$  r%  r&  r'  r(  r#  compiler^   rP   ri   r   r   r   r  r  r6  __name__r3   r3   r3   r4   <module>   s`    



+

dJ
