o
    0jRn                     @  s  d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	 ddl
ZddlZddlmZ ddlmZmZmZmZ ddlmZ d	d
lmZ d	dlmZ ee jd Zed d Zed d Z ed d Z!ed d d d Z"ddgZ#dZ$e%e&ee&eB Z'g dZ(ddgZ)ddgZ*g dZ+		 	!dadbd*d+Z,	,	-	.dcddd5d6Z-		 	!	dedfd7d8Z.	9	 	!	dgdfd:d;Z/	9	 	!	dgdfd<d=Z0	!	 	!	dhdfd>d?Z1	@	!	!	didjdFdGZ2dkdLdMZ3dldQdRZ4dmdVdWZ5dndXdYZ6dod\d]Z7d^d_ Z8e9d`kre8  dS dS )puG  V3 face inference — run the trained student on baked scenarios and write
the prediction as a viewer JSON so you can A/B against the teacher target.

Usage:
    # Predict a few scenarios, write viewer files, also print per-channel L1
    PYTHONPATH=. python3 -m models.v3_face.infer -s long_001 long_046 long_001_p0

    # Use latest.pt instead of best.pt
    PYTHONPATH=. python3 -m models.v3_face.infer --ckpt models/v3_face/checkpoints/latest.pt -s long_001

    # All curated viewer scenarios (anything currently in data/viewer/)
    PYTHONPATH=. python3 -m models.v3_face.infer --all-viewer

Output:
    data/viewer/<sid>_pred_dataset.json   (V3 student prediction)
    data/viewer/<sid>_pred_dataset.mp3    (symlink to teacher's mp3 — same audio)
    manifest.json updated with the _pred entries so the player dropdown shows them
    )annotationsN)Path)DictList)gaussian_filter1d)ARKIT_52_NAMESLIPSYNC_ONLYEXPRESSION_ONLYSHARED_CHANNELS)
add_blinks   )V3FaceConfig)V3FaceModel   datav3_trainingvieweremotionmodelsv3_facecheckpointszbest.pt   	      )r   r   r                     )
                              >@      ?{Gz?      ?signal
np.ndarrayfpsfloat
min_cutoffbetad_cutoffreturnc                   s   ddl  t| }|dkr| jtjddS tj|tjd}t| d |d< d}d| } fd	d
}	td|D ]B}
|	||}t| |
 ||
d   | }|| d| |  }|}||t|  }|	||}|t| |
  d| ||
d    ||
< q5|S )zAdaptive low-pass. Cutoff rises with |dx|, so fast changes pass through
    with minimal smoothing and slow drift gets aggressively smoothed.
    Reference: animasync-face-v1/training/inference.py::OneEuroFilter.
    r   Nr   T)copy)dtype        r+   c                   s   d j  | |  }||d  S )N       @r+   )pi)t_ecutoffrmath L/dataset/kemix-engine/package/face/animasync-face-v3/models/v3_face/infer.pysfR   s   z_one_euro.<locals>.sf)	r=   lenastypenpfloat32zerosr/   rangeabs)r,   r.   r0   r1   r2   Toutdx_prevter@   ia_ddxdx_hatr:   ar>   r<   r?   	_one_euroB   s$   

*rQ         @      ?   blendshapesscenario_idstrmean_interval_sexpressive_capintc                 C  s   |   tj}tj|dddf ||dddf d tj|dddf ||dddf d dD ]}tj|dd|f d|dd|f d q2t||||dS )u  Inject procedurally generated Poisson-distributed blinks on channels
    8 and 9. Blink timing is decoupled from audio in real speech, so V3 can't
    learn it; we use the same deterministic generator the teacher uses
    (scripts.compiler.eye_motion.add_blinks) seeded by scenario_id.

    The model's existing values on channels 8 and 9 are preserved up to
    `expressive_cap` so legitimate sustained partial closure (squinty smile,
    sleepy, crying — typically 0.2–0.4) survives. add_blinks composes via
    max(), so the procedural 0.70 blink peaks always show through. Anything
    the model produced above the cap (e.g., the solo failure mode where it
    locks at 0.99) is clamped — that's a model failure, not expression.

    Also caps eyeSquint (18, 19) and eyeWide (20, 21) at 0.84 (just under the
    0.85 suppression threshold in add_blinks) so a misprediction in those
    channels doesn't block blinks entirely.
    Nr   )rI   r   )r   r   r   r   gzG?)seed_strr.   rX   )r4   rB   rC   rD   minimum_add_blinks_proc)rU   rV   rX   rY   r.   resultchr>   r>   r?   inject_blinksa   s   ((*r`   c                 C  Z   |   tj}tD ]}t|dd|f ||||d|dd|f< q
t|ddtjS )zhApply One-Euro to the 5 brow channels. Transition spikes preserved,
    intra-turn jitter smoothed.
    Nr.   r0   r1   r2   r6   r+   )r4   rB   rC   rD   BROW_CHANNELSrQ   cliprU   r0   r1   r2   r.   r^   r_   r>   r>   r?   smooth_brows~   s   rf   皙?c                 C  ra   )u   Apply One-Euro to eyeSquint L/R (ch 18, 19). Heavier than brows by
    default (min_cutoff 0.8 vs 1.5) — orbicularis oculi is naturally slow
    and sustained, so we can attenuate sub-Hz wiggle without losing real
    squints.
    Nrb   r6   r+   )r4   rB   rC   rD   EYE_SQUINT_CHANNELSrQ   rd   re   r>   r>   r?   smooth_eye_squint      
ri   c                 C  ra   )u   Apply One-Euro to eyeWide L/R (ch 20, 21). Same shape as eyeSquint —
    paired anti-correlated channel that transitions from surprise to other
    emotions, where smoothing helps the cross-fade look continuous.
    Nrb   r6   r+   )r4   rB   rC   rD   EYE_WIDE_CHANNELSrQ   rd   re   r>   r>   r?   smooth_eye_wide   s   	rl   c                 C  ra   )u)  Apply One-Euro to the 8 eyeLook channels (10–17 = gaze direction /
    pupils). The compiler's iris drift is already smooth; this filter kills
    the model's residual high-frequency reproduction noise. Slightly higher
    cutoff than eyeSquint so saccade-like rapid gaze shifts still pass.
    Nrb   r6   r+   )r4   rB   rC   rD   EYE_LOOK_CHANNELSrQ   rd   re   r>   r>   r?   smooth_eye_look   rj   rn   333333?	thresholdscalepre_smooth_sigmamouth_close_sigmafloat | Nonec                 C  s   |du r|}|   tj}|d }|d }tD ]X}|tkr |n|}	|dd|f   }
|	dkr6t|
|	d}
|
 dkrBt|
 nd}|
| }t	|| ||  dd}|| dd	|   }t	|
| | dd|dd|f< q|S )
u   V1's crisp_mouth — soft-threshold gate on mouth/jaw channels.

    For each lipsync-branch channel:
      1. Pre-smooth with a small Gaussian (kills HF noise without distorting
         legitimate phoneme onsets)
      2. Normalize to [0, 1] by channel max
      3. Apply smoothstep gate with edges (0.3·threshold, 1.2·threshold) —
         values below 0.3·threshold fade to 0, values above 1.2·threshold
         pass through fully
      4. Scale and clip

    Args:
        pre_smooth_sigma: Gaussian σ for every lipsync channel EXCEPT
            mouthClose.
        mouth_close_sigma: Gaussian σ for mouthClose (ch 26). If None,
            falls back to `pre_smooth_sigma`. Set lower than the main sigma
            to keep m/b/p closures crisp while smoothing the rest.

    Reference: animasync-face-v1/deployment/lipsync_distilled.py::crisp_mouth.
    The smoothstep gate is the key: it kills sub-threshold jitter (our 13%
    HF residual) WITHOUT being a low-pass filter that smears phoneme attacks.
    Nro   g333333?r   )sigmar+   r6   g      @r7   )
r4   rB   rC   rD   CRISP_CHANNELSMOUTH_CLOSE_CHr   maxr/   rd   )rU   rp   rq   rr   rs   r^   edge0edge1r_   sigma_chvalsmax_val
normalizedtgater>   r>   r?   crisp_mouth   s    $r   	ckpt_pathr   devicetorch.devicec                 C  s   t j| |dd}|d }t }| D ]\}}t||r/|dkr)t|tr)t|}t||| qt	|
|}||d  |  td|  d|dd	 d
|dtddd|jd dd	 ||fS )NF)map_locationweights_onlyconfig	dilationsmodelz[ckpt] z  epoch=epoch?z	  val_l1=val_l1nan.4fz	  params=g    .Az.2fM)torchloadr   itemshasattr
isinstancelisttuplesetattrr   toload_state_dictevalprintgetr/   n_params)r   r   ckptcfg_dictcfgkvr   r>   r>   r?   
load_model   s$   


r   predtargetDict[str, float]c           	      C  s   | j |j krt| j d |j d }| d| |d| } }t| | jdd}t|t  }t|t  }t|t  }t|t	  }t| }|||||dS )z'Average L1 grouped by channel category.r   N)axis)overalllipsync
expressionsharedblink)
shapeminrC   rG   meanr/   r   r	   r
   	EYE_BLINK)	r   r   rH   difflipexpshrr   r   r>   r>   r?   per_channel_l1  s   r   sidemotion_dir
List[dict]c                 C  sh  ddl }|d}dD ]}|| }| sq| .}|D ]#}t|}|d | kr@dd t|d D   W  d     S qW d   n1 sKw   Y  || }	|	r| N}|D ]C}t|}|d |	d	krt	|	d
}
|d |
 }|
|
dd|
dg d|
dd|
dddg  W  d     S q^W d   n1 sw   Y  qg S )zAFind turns metadata for a scenario across the three split JSONLs.r   Nz^(daily_.+)_t(\d+)$)zseed_train_final.jsonlzseed_val.jsonlzseed_test.jsonlrV   c              
   S  sT   g | ]&\}}| d d r|| dd| dg d| d d| dddqS )text r   neutralvadr   r   r   speakerturn_idxr   r   r   r   )r   strip).0tir   r>   r>   r?   
<listcomp>  s    
z#load_turns_meta.<locals>.<listcomp>turnsr   r   r   r   r   r   r   r   r   r   )recompileexistsopenjsonloads	enumeratematchgrouprZ   r   )r   r   r   SPLIT_REfnamepflinerowmr   r   r>   r>   r?   load_turns_meta  sH   











r   c                 C  sF  |j |  d }| std|  d|  i S t|}t|d tj	d
|}|d tj}t|ddp=d}|dkrZt|d	d	d	d
f |ddd|d	d	d	d
f< t|	d
|}	|d tj}
t  |||	d  tj}W d	   n1 sw   Y  t|ddrt||j|j|j|jd}t|ddrt||j|j|jd}t|ddrt||j|j|jd}t|ddrt||j|j |j!d}t|ddrt"||j#|j$|j%d}t|ddrt&|| |j'|j(d}t)||
}|j*rd|j* nd}|  d| d}|dt+|j,d t-t.| |j/t0|d1 d}|j2| d }|3t4j5|dd  |j2|  d! }|j2| d" }| sU|6 rY|7  | re|8|j9 ntd#|  d$| d% td&|  d'|d( d)d*|d+ d)d,|d- d)d.|d/ d)d0|d1 d)d2|j,d  d3 | |d4|S )5zDRun model on one scenario, write viewer JSON, return per-channel L1.z.npzu     ✗ z: no npz found at audior   condsmooth_cond_sigma_emotionr6   Nr&   nearest)ru   r   moder   crispF)rp   rq   rr   rs   brow_smooth)r0   r1   r2   eye_squint_smootheye_wide_smootheye_look_smoothr   )rV   rX   rY   _r   _pred_datasetrT   r   )rV   r.   
num_framesnamesr   rU   z.json)ensure_asciiz_dataset.mp3z.mp3z  ! z: teacher mp3 not found at uG    — the prediction will play silently. Run dataset_to_viewer.py first.u     ✓ z  L1: overall=r   r   z  lip=r   z  exp=r   z  shr=r   z  blink=r   z  (z frames))r   new_base):npz_dirr   r   rC   r   r   
from_numpyrB   rD   	unsqueezer   getattrr   no_gradsqueezecpunumpyr   crisp_thresholdcrisp_scalecrisp_sigmacrisp_mouthclose_sigmarf   brow_min_cutoff	brow_betabrow_d_cutoffri   eye_squint_min_cutoffeye_squint_betaeye_squint_d_cutoffrl   eye_wide_min_cutoffeye_wide_betaeye_wide_d_cutoffrn   eye_look_min_cutoffeye_look_betaeye_look_d_cutoffr`   blink_intervalblink_expressive_capr   variant_tagrZ   r   r   r   r   roundtolist
viewer_dir
write_textr   dumps
is_symlinkunlink
symlink_toname)r   r   r   argsnpz_pathr   r   cond_npsmooth_sigmar   r   r   metricsvariant_partr   viewer_jsonout_jsonteacher_mp3pred_mp3r>   r>   r?   predict_and_save5  s   
"
"



r  r  predictionsc                 C  s   | d }|  rt| ndg i}dd |d D }|D ]*}|d }||d  di }||g |dd	|d
g d|dd d||< qt| |d< |tj|ddd dS )z8Add _pred_dataset entries to the viewer's manifest.json.zmanifest.json	scenariosc                 S  s   i | ]}| d |qS )base)r   )r   sr>   r>   r?   
<dictcomp>  s    z#update_manifest.<locals>.<dictcomp>r   r   r   n_turnsr   emotionsz
[V3 pred] text_previewr   )r  rV   variantsr  r  r  Fr   )r   indentN)	r   r   r   	read_textr   r   valuesr  r  )r  r  manifest_pathmanifestseenr   r   teacher_entryr>   r>   r?   update_manifest  s$   

r'  c                  C  s  t  } | jdttd | jdttd | jdttd | jdttd | jddd | jd	td
dd | jdt	d dd | jdddd dd | jdddd | jdddd | jdtddd | jdtddd | jd tdd!d | jd"tdd#d | jd$dd%d | jd&td'd(d | jd)td*d+d | jd,tdd-d | jd.dd/d | jd0td1d2d | jd3td*d4d | jd5tdd6d | jd7dd8d | jd9td1d:d | jd;td*d<d | jd=tdd>d | jd?dd@d | jdAtddBd | jdCtd*dDd | jdEtddFd | jdGddHd | jdItdJdKd | jdLtdMdNd | 
 }|js2|js2| dO |jreg }t|jdPD ]}|j}dQ|v rLq@||d tdR   q@tdSt| dT n|j}t|j}t|j|\}}tdUt| dV|j  g }|D ]}	t|	|||}
|
r||
 q|rt|j| tdWdX |D }tdYdX |D }tdZdX |D }td[dX |D }td\t| d] td^|d_ td`|d_ tda|d_ tdb|d_ tdc tdd tde d S )fNz--ckpt)typedefaultz	--npz_dirz--viewer_dirz--emotion_dirz--devicezcuda:0)r)  z--smooth-cond-sigma-emotionr6   u   Gaussian σ (frames @ 30 fps) for smoothing the emotion one-hot in cond[:, :16]. MUST match the value used during training of the loaded checkpoint. 0 = no smoothing (default).)r(  r)  helpz--variant-tagzOptional tag baked into output filenames ({sid}_pred_{tag}_dataset.json). Use to keep predictions from different gain variants from clobbering each other in data/viewer/.z-sz--scenarios+z!Specific scenario IDs to predict.)nargsr)  r*  z--all-viewer
store_truezcPredict every scenario currently curated in data/viewer/ (i.e. every *_dataset.json without _pred).)actionr*  z--crispzApply V1-style crisp_mouth post-processing to lipsync channels (Gaussian pre-smooth + smoothstep soft-threshold gate). Kills sub-threshold mouth jitter without smearing phoneme attacks. Reference: animasync-face-v1/deployment/lipsync_distilled.py.z--crisp-thresholdro   z4Smoothstep center for crisp_mouth gate. Default 0.3.z--crisp-scaler+   zhAmplification factor after the gate. Default 1.0 (V1 used 1.2 to compensate for the gate's attenuation).z--crisp-sigmau   Gaussian σ (frames) for pre-smoothing before the gate, applied to every lipsync channel EXCEPT mouthClose. Default 1.0 (V1's smooth_frames=2 → σ=1).z--crisp-mouthclose-sigmau   Separate Gaussian σ for mouthClose (ch 26). Default 1.0. Keep at or below --crisp-sigma so m/b/p phoneme closures stay sharp while the rest of the mouth can be slightly more smoothed.z--brow-smoothzApply gentle One-Euro adaptive filter to brow channels (0-4). Transition spikes pass through; intra-turn micro-jitter is softened. Matches V1's brow deployment.z--brow-min-cutoffr)   zOne-Euro min_cutoff for brows. V1 used 1.5 in deployment, 2.0 in training inference. Default 1.5 (barely visible smoothing). Raise toward 2.0 for even subtler effect.z--brow-betar*   u   One-Euro β — how fast the cutoff rises with |dx|. Higher = transitions pass through faster. Default 0.01 (V1's brow-deployment value).z--brow-d-cutoffz(One-Euro derivative cutoff. Default 1.0.z--eye-squint-smoothu  Apply One-Euro adaptive filter to eyeSquint L/R (ch 18, 19). Mirrors --brow-smooth but heavier — orbicularis oculi is naturally slow so we can smooth sub-Hz wiggle hard without killing real squints. Use this when training-time target smoothing alone doesn't kill the visible jitter.z--eye-squint-min-cutoffrg   z_One-Euro min_cutoff for eyeSquint. Lower than brows (default 0.8 vs 1.5) for heavier smoothing.z--eye-squint-betau(   One-Euro β for eyeSquint. Default 0.01.z--eye-squint-d-cutoffz6One-Euro derivative cutoff for eyeSquint. Default 1.0.z--eye-wide-smoothu   Apply One-Euro adaptive filter to eyeWide L/R (ch 20, 21). Paired with eyeSquint for surprise → other-emotion transitions where eyeWide↓ and eyeSquint↑ cross discontinuously.z--eye-wide-min-cutoffz-One-Euro min_cutoff for eyeWide. Default 0.8.z--eye-wide-betau&   One-Euro β for eyeWide. Default 0.01.z--eye-wide-d-cutoffz4One-Euro derivative cutoff for eyeWide. Default 1.0.z--eye-look-smoothu  Apply One-Euro adaptive filter to the 8 eyeLook channels (10–17, gaze direction / pupils). The compiler's iris drift is already smooth; this kills the model's residual reproduction noise. Cutoff slightly higher than eyeSquint so saccade-like rapid shifts still pass.z--eye-look-min-cutoffz6One-Euro min_cutoff for eyeLook (pupils). Default 1.0.z--eye-look-betau&   One-Euro β for eyeLook. Default 0.01.z--eye-look-d-cutoffz4One-Euro derivative cutoff for eyeLook. Default 1.0.z--add-blinksa  Replace V3's blink channels (8, 9) with procedural Poisson-distributed blinks. Uses the same generator as the training teacher (eye_motion.add_blinks), seeded deterministically by scenario_id. Use this because blink timing isn't audio-determined and V3 cannot learn it from audio alone.z--blink-intervalrR   u   Mean blink interval in seconds. Default 4.0 (matches the teacher's bake-time value). Real human blink rate is 15-20/min ≈ 3-4 s.z--blink-expressive-caprS   a  Cap for the model's pre-existing eyeBlink (ch 8, 9) values. Below the cap, expressive sustained eye closure (squinty smile, sleepy) survives. Above, broken predictions (locked-shut eyes) are clamped. Procedural blinks always poke through via max(). Default 0.5.z#provide --scenarios or --all-viewerz*_dataset.json_pred_datasetr   zall-viewer: z
 scenariosz
Predicting u    scenarios → c                 S     g | ]}|d  qS )r   r>   r   r   r>   r>   r?   r   E      zmain.<locals>.<listcomp>c                 S  r0  )r   r>   r1  r>   r>   r?   r   F  r2  c                 S  r0  )r   r>   r1  r>   r>   r?   r   G  r2  c                 S  r0  )r   r>   r1  r>   r>   r?   r   H  r2  u   
── aggregate over u    scenarios ──z  overall L1: r   z  lipsync L1: z  expression L1: z  blink L1: z;
Done. Hard-reload viewer. Each scenario now appears twice:z%  <sid>_dataset      = teacher targetz,  <sid>_pred_dataset = V3 student prediction)argparseArgumentParseradd_argumentr   DEFAULT_CKPTDEFAULT_NPZ_DIRDEFAULT_VIEWER_DIRDEFAULT_EMOTION_DIRr/   rW   
parse_argsr  
all_viewererrorsortedr  globstemappendrA   r   r   r   r   r   r  r'  rC   r   )apr  sidsr   r?  r   r   r   r  r   r^   r   r   r   r   r>   r>   r?   main  s   





















rC  __main__)r(   r)   r*   r+   )r,   r-   r.   r/   r0   r/   r1   r/   r2   r/   r3   r-   )rR   rS   rT   )rU   r-   rV   rW   rX   r/   rY   r/   r.   rZ   r3   r-   )r)   r*   r+   r(   )rU   r-   r0   r/   r1   r/   r2   r/   r.   r/   r3   r-   )rg   r*   r+   r(   )r+   r*   r+   r(   )ro   r+   r+   N)rU   r-   rp   r/   rq   r/   rr   r/   rs   rt   r3   r-   )r   r   r   r   )r   r-   r   r-   r3   r   )r   rW   r   r   r3   r   )r   rW   r3   r   )r  r   r  r   ):__doc__
__future__r   r3  r   pathlibr   typingr   r   r   rC   r   scipy.ndimager   scripts.compiler.constantsr   r   r	   r
   scripts.compiler.eye_motionr   r]   r   r   r   r   __file__resolveparentsPROJECT_ROOTr7  r8  r9  r6  r   rw   r=  setrv   rc   rh   rk   rm   rQ   r`   rf   ri   rl   rn   r   r   r   r   r  r'  rC  __name__r>   r>   r>   r?   <module>   s     
-


#
k 
