o
    ^jĘ                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlZddlZddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z m!Z! i ddddddddddddddddddddddddddddddddZ"g dZ#dZ$d Z%e&d!Z'ee() j*d" Z+e+d# d$ d% Z,e+d# d& Z-e+d# d' Z.d(Z/g d)Z0d*d+ e1e0D Z2	-ddd8d9Z3dd=d>Z4dd@dAZ5ddCdDZ6dddLdMZ7ddQdRZ8ddTdUZ9ddZd[Z:	\	^	_dddddeZ;	]	^dddgdhZ<	i	j	kdddodpZ=	i	j	kdddqdrZ>	sdddudvZ?	dddzd{Z@	|ddddZAdddZB	|	 				\	s		_		]			i	j	kddddZCdd ZDeEdkr^eFeD  dS dS )u  Training data generation pipeline for V3 lipsync model.

Produces .npz triples per scenario:
    - audio_features: (T, 141) [mel or wav2vec features — TBD, simple mel for now]
    - conditioning: (T, 19) [16 emotion one-hot + 3 VAD]
    - target: (T, 52) [LAM lipsync + compiler expression merged by channel rules]

Usage:
    python -m scripts.compiler.data_pipeline --limit 10   # test run
    python -m scripts.compiler.data_pipeline              # full run
    )annotationsN)Path)List   )LAM_WEIGHTS_SHAREDLIPSYNC_ONLYEXPRESSION_ONLYSHARED_CHANNELS)compile_expressive_batch)apply_eye_motion)
LAMWrapper)apply_tremorsilence_gate_from_wav)	synth_all)build_synthetic_presetsload_presets_from_jsonneutraljoylaughter
excitement	agreement	gratitudesadnesscryingsulkapologystruggleangerrefusalsurpriseflustershy)r   r            皙?皙?data_pipeliner"   dataemotionzseed_train_final.jsonlv3_trainingaudio_preview   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   c                 C  s   i | ]\}}||qS  r-   ).0ier-   r-   V/dataset/kemix-engine/package/face/animasync-face-v3/scripts/compiler/data_pipeline.py
<dictcomp>H       r2   
elevenlabsr$   turns
List[dict]out_dirr   backendstrconcurrencyintreturn
List[Path]c           
        s    j ddd dd | D }dd | D }dd | D } fddtt| D }t||f||||d|I d	H }	d
d t||	D S )zDTTS all turns via selected backend, passing emotion+VAD for prosody.Tparentsexist_okc                 S     g | ]}|d  qS )textr-   r.   tr-   r-   r1   
<listcomp>P       z%synth_turns_batch.<locals>.<listcomp>c                 S     g | ]}| d qS r)   getrC   r-   r-   r1   rE   Q   r3   c                 S  rG   vadrI   rC   r-   r-   r1   rE   R   r3   c                   s   g | ]
} |d d qS )06d.mp3r-   )r.   r/   r7   r-   r1   rE   S       )r8   r:   emotionsvadsNc                 S  s   g | ]
\}}|r
|nd qS Nr-   )r.   pokr-   r-   r1   rE   \   rP   )mkdirrangelenr   zip)
r5   r7   r8   r:   
tts_kwargstextsrQ   rR   pathsok_flagsr-   rO   r1   synth_turns_batchK   s"   r^   scendict	audio_dirc                 C  s   |  d| d }|  d}g }t| d D ]_\}}| dd s'|d q|dur/|| n|}| dd	}|| d
| d| d }	|	 rU|	 jdkrU||	 qt|| d
| d}
dd |
D }
||
rq|
d nd q|S )uG  Find pre-generated audio files for a scenario's turns.

    Matches filename pattern: {scenario_id}_t{turn_idx}_{emotion}.mp3
    Returns list aligned to scen['turns']; None if a turn's audio is missing
    or the text is empty.

    For per-turn dialogue splits, audio is named after the *original* scenario
    and turn index, not the new pseudo-scenario id. expand_split_dialogues
    sets `_source_scenario_id` and `_source_turn_indices` so we can reroute
    lookup to the original (sid, ti) here.

    SAFE: never calls TTS, never writes to audio_dir — read-only lookup.
    _source_scenario_idscenario_id_source_turn_indicesr5   rB    Nr)   r   _t_rN     z_*.mp3c                 S  s   g | ]}|  jd kr|qS )rh   )statst_size)r.   mr-   r-   r1   rE   }   s    z-lookup_audio_for_scenario.<locals>.<listcomp>r   )	rJ   	enumeratestripappendexistsri   rj   listglob)r_   ra   sidsrc_tisr\   local_titurn	actual_tiemoexpectedmatchesr-   r-   r1   lookup_audio_for_scenario_   s"   


rz   variantc           
      C  sz   ddl }ddl}|jdd ddlm} ddlm}m}m	}m
} td|  |j|dgd}| }	| ||	|||d	S )
ul  Lazy-load V2 ONNX session + feature extractor and pull the V2-dynamics
    helpers out of abc_experiment.py. Lazy import because abc_experiment.py
    imports `merge_lam_compiler` / `speech_gate` from THIS module — a
    module-level import would cycle.

    Returns a dict with keys: variant, sess, feat, run_v2, apply_v2_dynamics,
    get_preset_envelope.
    r   Nz-/dataset/text-to-face-se/LAM_Audio2Expression)AudioFeatureExtractor)ONNX_V2run_v2apply_v2_dynamicsget_preset_envelopezLoading V2 ONNX: CPUExecutionProvider)	providers)r{   sessfeatr~   r   r   )sysonnxruntimepathinsertdistillation.student_modelr|   scripts.compiler.abc_experimentr}   r~   r   r   LOGinfoInferenceSession)
r{   _sysortr|   r}   r~   r   r   r   r   r-   r-   r1   _load_v2_helpers   s   	r   	scenariosc              	   C  s~   g }| D ]8}| dd}|ds|| qt|d D ]\}}| dd s*q|| d| ||g|gd qq|S )u  Expand each daily_* dialogue scenario into one pseudo-scenario per
    non-empty turn (short monologue per turn).

    Pseudo-scenario shape:
        scenario_id:          "{original_sid}_t{turn_idx}"  (drives .npz name
                                                             + tremor/eye seed)
        turns:                [original turn]                (single-turn —
                                                              no transitions)
        _source_scenario_id:  original sid                   (audio lookup)
        _source_turn_indices: [original turn_idx]            (audio lookup,
                                                              filenames carry
                                                              the original ti)

    Rationale: the blendshape model only consumes (audio, VAD) → face at the
    rendering stage, and contextual emotion learning already lives in
    MicroAlbert (text + previous-turn context). So we drop the dialogue
    structure for this dataset and emit each turn as a self-contained short
    monologue. Long_/solo_ scenarios pass through unchanged — their multi-
    turn structure is what teaches inter-emotion transitions.
    rc   re   daily_r5   rB   rf   )rc   rb   rd   r5   )rJ   
startswithrn   rl   rm   )r   outsrr   tiru   r-   r-   r1   expand_split_dialogues   s"   


	r   >  P   wav
np.ndarraysrfpsn_melsc                 C  s:   t || }tjj| |||dd}t|j}|tjS )zKExtract mel features aligned to fps.

    Returns (T, n_mels) float32.
    i   )yr   r   
hop_lengthn_fft)	r;   librosafeaturemelspectrogrampower_to_dbTastypenpfloat32)r   r   r   r   r   mellog_melr-   r-   r1   mel_features   s   
r   rL   List[float]r   c                 C  sT   t j|dft jd}t| d}d|dd|f< t j|t jd|ddddf< |S )zD(T, 19) conditioning: 16-dim one-hot + 3 VAD, broadcast over frames.   dtyper         ?N   )r   zerosr   EMOTION_TO_IDXrJ   asarray)r)   rL   r   condidxr-   r-   r1   build_conditioning   s
    r   lam_bsc                 C  sj   d| dddf  d| dddf   d| dddf   d| dddf   }t |d d	dt jS )
u   Compute per-frame speech activity [0, 1] from LAM mouth activity.

    Per V3_IMPLEMENTATION_PLAN_v2 §3.4:
        activity = 1.2*jawOpen + 1.5*mouthClose + 1.0*mouthFunnel + 1.0*mouthPucker
    Normalized via sigmoid-ish.
    g333333?N         ?   r      %           )r   clipr   r   )r   activityr-   r-   r1   speech_gate   s   	r   prev_vfloatnext_vrD   c                 C  s   t d }|d| k r*d| dkr|d|  nd}ddt|tj   }d| |  S |d| k r2dS d| }|dkrB|d|  | nd}ddt|tj   }|| S )u  Brow channel value over a crossfade routed through the neutral (0)
    pose. Used when |delta| > BROW_SWING_DELTA so a sad↔anger inversion
    doesn't slide linearly between extremes.

    Same profile as abc_experiment.py:
        [0, 0.5−PAUSE/2]:  prev → 0 (cosine ramp-down)
        [0.5−PAUSE/2, 0.5+PAUSE/2]:  hold at 0
        [0.5+PAUSE/2, 1]:  0 → next (cosine ramp-up)
    r"         ?r   r   r   )NEUTRAL_PAUSE_FRACTIONr   cospi)r   r   rD   
half_pauselocal_teaseddenomr-   r-   r1   _brow_pass_through_zero   s   
r         >@r   r   r   signal
min_cutoffbetad_cutoffc                 C  s   dd }t | }tj|tjd}| d |d< d}d| }	td|D ]>}
||	|}| |
 ||
d   |	 }|| d| |  }|}||t|  }||	|}|| |
  d| ||
d    ||
< q!|S )zOne-Euro adaptive low-pass. Peak-preserving smoother for expression
    channels (not lipsync-critical). Same impl as abc_experiment.py.c                 S  s   dt j | |  }||d  S )N       @r   )r   r   )tecutoffrr-   r-   r1   sf
  s   z_one_euro_filter.<locals>.sfr   r   r   r   r   )rX   r   r   r   rW   abs)r   r   r   r   r   r   r   r   dx_prevr   r/   a_ddxdx_hatr   ar-   r-   r1   _one_euro_filter  s   

&r   targetc                 C  sh   |   }tttttdh B }|D ]}t|dd|f ||d|dd|f< qt|ddtj	S )zDApply One-Euro filter to expression channels (not lipsync-critical).r   Nr   r   r   r   )
copysortedsetr   r	   r   r   r   r   r   )r   r   r   result	smooth_chchr-   r-   r1   smooth_expression_channels  s   r   333333?皙?Q?
base_alphajitter_alphajitter_thresholdc           	      C  s   t | }|dkr| jtjddS tj|tjd}| d |d< td|D ],}tt| | t| |d   }||kr;|n|}|| |  d| ||d    ||< q#|S )a^  V2-style jitter-gate EMA. Small per-frame deltas get heavy smoothing
    (alpha=jitter_alpha); deltas above `jitter_threshold` pass through with
    light smoothing (alpha=base_alpha). Removes sub-threshold mouth jitter
    without flattening real phoneme transitions.

    Mirrors animasync-face-v2/pipeline_v2/smooth_v2.py::jitter_gate_smooth.
    r   T)r   r   r   r   )rX   r   r   r   r   rW   r   r   )	r   r   r   r   r   r   rD   deltaalphar-   r-   r1   _jitter_gate_smooth)  s    &r   c                 C  sb   |   }tttdhB }|D ]}t|dd|f |||d|dd|f< qt|ddtjS )ac  Apply V2 jitter-gate smoothing to LIPSYNC_ONLY + jawOpen (ch 24).

    The compiler+LAM teacher target carries sub-threshold high-frequency
    noise in the mouth/jaw channels that V3 then learns and amplifies.
    Smoothing the GT before training removes that noise floor while
    preserving real phoneme onsets (which exceed the jitter threshold).
    r   Nr   r   r   r   r   )	r   r   r   r   r   r   r   r   r   )r   r   r   r   r   lip_chr   r-   r-   r1   smooth_lipsync_channels@  s   r   `   fade_framesc                   s"  t j| ddt j}|d }d}t|dd D ]u\}}||7 }| | d | |d  d  td|| }t|jd || }	|	| }
|
dkrIq fddtD }t	||	D ]5}|| |
d  }d	d
t 
|t j   }d
|  |   ||< |D ]}t|  | ||||f< q|qXq|S )zCosine-eased blend across turn boundaries, with brow pass-through-zero
    on inverting (large-delta) brow channels. Mirrors abc_experiment.py.r   axisr"   Nr   c                   s0   g | ]}t t| t |  tkr|qS r-   )r   r   BROW_SWING_DELTA)r.   r   	next_pose	prev_poser-   r1   rE   e  s
     z-crossfade_turn_boundaries.<locals>.<listcomp>r   r   )r   concatenater   r   rl   maxminshapeBROW_CHANNELSrW   r   r   r   )
comp_stackturn_lengthsr   concathalfcursorr/   Ti
fade_startfade_endLbrow_pass_channelsfrD   r   r   r-   r   r1   crossfade_turn_boundariesU  s4   r  rR   presetssigmac                 C  s   t jdd | D t jd}t jdd | D t jd}| jd }t j|dft jd}dd|d	   }t|D ])}t j|| |  d	 d
d}	t |	 | }
|
 }|dkr\|
| }
|
| ||< q9t 	|dd
t jS )u   RBF over ALL preset anchors based on VAD distance — cross-emotion blend.
    Matches abc_experiment.py.cross_emotion_compile.c                 S  rA   rK   r-   r.   rT   r-   r-   r1   rE   x  rF   z)cross_emotion_compile.<locals>.<listcomp>r   c                 S  rA   )bsr-   r  r-   r-   r1   rE   z  rF   r   4   r   r   r"   r   r   g&.>r   )r   r   valuesr   r  r   rW   sumexpr   r   )rR   r  r  anchor_vads	anchor_bsr   r   	inv_2sig2rD   d2wr   r-   r-   r1   cross_emotion_compilet  s"   
r  ?turn_emotions	List[str]fleeting_scalec                   s   t | }|dks dkrdg| S dd | D }t t|dkr%dg| S d d  dg| }d}||k rh|}||k rT|| || krT|d7 }||k rT|| || ksDt||D ]}|| ||< qY|}||k s6 fdd|D S )u  For each turn, compute the magnitude scale based on emotion-family
    persistence across adjacent turns. Single-turn scenarios bypass entirely
    (returned scales are all 1.0). Same rule as abc_experiment.py:
       persistence == 1 → fleeting_scale
       persistence == 2 → midpoint(fleeting_scale, 1.0)
       persistence >= 3 → 1.0
    All-same-base monologues also bypass (sustained = full strength).
    r   r   c                 S  s   g | ]}t |d qS )r   )SUB_TO_BASErJ   )r.   r0   r-   r-   r1   rE     s    z.compute_persistence_scales.<locals>.<listcomp>r   r   c                   s(   g | ]}|d kr
 n|dkrndqS )r   r"   r   r-   r  r   pairedr-   r1   rE     s    )rX   r   rW   )r  r   nbasesrun_lenr/   jkr-   r"  r1   compute_persistence_scales  s*   



r)  lamcompgatec           
      C  s  | j d }t| }tD ]}| dd|f |dd|f< qtD ]}|dd|f |dd|f< qtD ]e}t| }|dkrkd|dd|f d d|   }| dd|f | |dd|f d|  d  |dd|f< q2|| dd|f  d| |dd|f   }|| d| |dd|f   |dd|f< q2|dddf |ddd	f  d }	|ddd
f  d|	d  9  < |dddf  d|	d  9  < |dddf  d|	d  9  < t|ddtj	S )z=Merge LAM lipsync + compiler expression per V3 channel rules.r   Nr   r   r   r   333333?+   ,   r   皙?   g?r,   r   )
r  r   
zeros_liker   r   r	   r   r   r   r   )
r*  r+  r,  r   r   r   r  emotion_gainblended_activesmiler-   r-   r1   merge_lam_compiler  s$   

 >,.$   r6  r-  ffffff?      @y&1?Fscenarioaudio_pathsr   out_pathpersistence_dampingcross_emotion_weightcross_emotion_sigmavad_damp_gammavad_damp_betavad_smooth_sigmablink_interval_soption_e_intensity
tremor_amptremor_sigma
v2_helperslipsync_smoothboollipsync_smooth_alphalipsync_smooth_jitter_alphalipsync_smooth_thresholdc           F      C  s  g }t | d D ]\}}|d  sq|| }|du s| s qtjt|ddd\}}t|dk r3q||}|jd }t	||t
d	}|jd |krR|d| }n!|jd |k rs||jd  }tj|t|d
d |dfgdd}t|}t|||t
d} d}!|dur|d |d |d ||d }!|!jd |kr|!d| }!n!|!jd |k r||!jd  }tj|!t|!d
d |dfgdd}!|||d t|d ||||| |!d	 q|sdS tdd |D |d}"|dkr6t|dkr6t|}#t|	}$tj|d d tjd}%|dd D ]'}&tj|&d tjd}'|#|' d|# |%  }(|$|% d|$ |'  }%|( |&d< qtjdd |D dd})tdd |D }*tj|*ttftjd}+d},|D ]}&t|&d d}-d|+|,|,|&d  |-f< |,|&d 7 },q[|
dkrt|dkrddlm}. |.|)|
ddd tj})|.|+|
ddd tj}+g }/g g }0}1d}2g }3t||"D ]\}&}4|&d }|&d }5|)|2|2|  }6|+|2|2|  }7|2|7 }2t |5g| |6||d!}8|d"krt!|6||d#}9t|}:d|: |8 |:|9  tj}8|4dk r	|8|4 tj}8|dur.|&d$dur.|d% |5|\};}<|d& |8|&d$ |d' |;|<d(}8|/|8 |3| |0|&d)  tj|7|6tjgd
dtj}=|1|= qt|3d*k}>|>rb|nt"|d+}?t|/dkrvt#|/|3|?d,}@n|/d }@tjd-d |D dd}Atjd.d |D dd}Bt$|A|@|B}C|>rd/nd0}Dt%|C|Dd1d2}C|rt&|C|||d3}Ct'|C| d4 t
|d5}C|d"kr|d"krtjd6d |D ddtj}Et(|C|E| d4 ||d7}Ctj)|tj|0ddtj|1dd|Cd8 dS )9u5  Two-pass scenario → .npz, mirroring abc_experiment.py's variant-C
    pipeline (compiler + LAM, no V2 ONNX). Defaults match the canonical
    lock-in (damp 0.65, xemo 0.2, blink 3.5, fade 96, σ=30, γ=0.3) — running
    with no flag overrides produces training targets that match what the
    viewer shows for _d65x20 scenarios.

    Pass 1: collect per-turn audio + LAM + speech gate + raw VAD.
    Persistence rule: pose-level scale per turn (multi-turn only).
    Causal VAD damping: pull each turn's VAD toward running mean of past.
    Cross-turn VAD smoothing: σ-frame Gaussian over per-frame VAD trajectory.
    Pass 2: per-turn compile (within-emotion + cross-emotion blend, then
    persistence damp).
    Crossfade between turn boundaries.
    Merge LAM, smooth expression channels, apply eye_motion.
    r5   rB   Nr   T)r   monog      @r   )r   r   r   r   r   )r   r~   r   r   r)   rL   )	turn_idxr)   rL   r   r   r   r,  silence_gatev2_bsFc                 S  rA   rH   r-   r.   cr-   r-   r1   rE   %  rF   z$process_scenario.<locals>.<listcomp>)r   r   r   c                 S  s0   g | ]}t t j|d  t jd|d dfqS )rL   r   r   r   )r   tiler   r   rQ  r-   r-   r1   rE   5  s    "c                 s  s    | ]}|d  V  qdS )r   Nr-   rQ  r-   r-   r1   	<genexpr>=  s    z#process_scenario.<locals>.<genexpr>r   )gaussian_filter1dnearest)r  r   mode)rQ   rR   r  parametric_overlay_intensityr   )r  rP  r   r   r{   )envelope_loenvelope_hir   r#      )r   c                 S  rA   )r   r-   rQ  r-   r-   r1   rE     rF   c                 S  rA   )r,  r-   rQ  r-   r-   r1   rE     rF   r0  r   r   r   r   rc   )seed_strr   rC  c                 S  rA   )rO  r-   rQ  r-   r-   r1   rE     rF   )rO  rc   ampr  )audior   r   )*rl   rm   ro   r   loadr9   rX   infer_audior  r   FPSr   r   rS  r   r   rn   rp   r)  r   arrayr   tolistr  r   EMOTION_LABELSr   rJ   scipy.ndimagerU  r   rY   r
   r  r   r  r6  r   r   r   r   savez_compressed)Fr:  r;  r*  r  r<  r=  r>  r?  r@  rA  rB  r   rC  rD  rE  rF  rG  rH  rJ  rK  rL  	collectedrN  ru   
audio_pathr   r   r   r   r   padr,  sgaterP  persist_scales   γ   βrunning_meanrR  rawdampedall_vadsn_totalall_emos_c_emor   rU  r  audio_featscondsr  r  psrw   	vad_slice	emo_slicecomp_bsxemor  env_loenv_hicond_per_frameis_monologuefade_for_thiscomp_catlam_catgate_catr   r   	sgate_catr-   r-   r1   process_scenario  sL  '

&







	r  c               	     s  t  } | jdttd | jdttd | jdttdd | jdtdd	d | jd
ttd dd | jdd d | jdddd | jdddd | jdt	ddd | jdt	ddd | jdt	ddd | jd t	d!d"d | jd#t	d$d%d | jd&t	d'd(d | jd)td*d+d | jd,t	d-d.d | jd/t	d0d1d | jd2t	d3d4d | jd5t	d6d7d | jd8dd9d | jd:dd;d | jd<t	d=d>d | jd?t	d@dAd | jdBt	dCdDd | jdEg dFdGdHdI | 
 }|jrdJ|_tjtjdKdL |jjdMdMdN |j s
tdO|j tdP|j  |jr,|j r,tdQ|j  t|j}ntdR t }tdSt| dT g }|j }|D ]}|t| qIW d    n	1 s`w   Y  tdUt| dV |j ! " }|r|dWvrt#dXdY |$dZD  |j%rt&d[dY  D s d\  td] t|} fd^d_|D }td`  da| dbt| dc |j%rt|}t'|}t(dddY |D }tde| dbt| df| dg |j)r|d |j) }tdht| dc g }	d}
|D ]t*|j}|	| |
t(fdidYt+|D 7 }
qt(djdY |D }||
 }tdk| dl| dm|
 dn |dkrEtdotdp t,|j-dq}d }|j.drkrgt/|j.}tds|j.  ntdt ddul0m0} d}t+||dvdwD ]l\}|jdx  dy }| r|dz7 }q|t1|	| |||fi d{|j2d||j3d}|j4d~|j5d|j6d|j7d|j8d|j9d|j:d|jd|j;d|d|j<d|j=d|j>d|j?}|r|dz7 }q|td| dlt| d td|j  d S )Nz--scenarios)typedefaultz--outputz--audio_dirz5Directory with pre-generated audio (read-only lookup))r  r  helpz--limitr   z0 = allz--presets_jsonzexpression_presets.jsonzUser-authored preset JSON. Defaults to <project_root>/expression_presets.json (the same file abc_experiment.py uses). Falls back to synthetic bootstrap if the file doesn't exist.z--device)r  z--filter-prefixzlong_,solo_zComma-separated scenario_id prefixes to keep. Default 'long_,solo_' (monologues + single-turn). Use 'all' or '' to disable filtering and process every scenario. When --split-dialogues is set, 'daily_' is auto-added if missing.)r  r  z--split-dialogues
store_trueu  Expand each daily_* dialogue into one .npz per turn (short monologue per turn). Audio lookup is rerouted to the original (sid, turn_idx). Dialogue context learning is left to MicroAlbert; this pipeline only consumes (audio, VAD) → face at the render stage.)actionr  z--persistence-dampingr  ziPose-level scale for fleeting (multi-turn isolated) emotions. Single-turn scenarios bypass. Default 0.65.z--cross-emotion-weightr&   z_Weight for cross-emotion VAD-distance blend (0=pure within-emotion, 1=pure cross). Default 0.2.z--cross-emotion-sigmar%   u?   Gaussian σ for cross-emotion VAD-distance kernel. Default 0.4.z--vad-damp-gammar-  u#   Causal VAD damping γ. Default 0.3.z--vad-damp-betar7  u#   Causal VAD damping β. Default 0.7.z--vad-smooth-sigmar   u<   Cross-turn VAD trajectory Gaussian σ in frames. Default 30.z--fade-framesr   z>Crossfade duration at turn boundaries (monologue). Default 96.z--blink-intervalr8  z3Mean seconds between blinks (Poisson). Default 3.5.z--option-e-intensityr   uC   α scalar for Option E parametric mouth/cheek overlay. Default 1.0.z--tremor-ampr9  zcBrow + eyeSquint tremor amplitude. Default 0.014 (matches viewer runtime tremor). Set 0 to disable.z--tremor-sigmar   uR   Gaussian σ for tremor noise smoothing (frames). Default 1.5 → ~2.2 Hz dominant.z--no-tremorz/Disable tremor baking entirely (clean targets).z--lipsync-smoothzApply V2-style jitter-gate EMA to LIPSYNC_ONLY + jawOpen channels of the teacher target before saving. Removes sub-threshold mouth noise V3 would otherwise learn. Mirrors animasync-face-v2/pipeline_v2/smooth_v2.py.z--lipsync-smooth-alphar   z8EMA alpha for above-threshold frame deltas. Default 0.6.z--lipsync-smooth-jitter-alphar   zWEMA alpha for sub-threshold (jitter) frame deltas. Lower = more smoothing. Default 0.1.z--lipsync-smooth-thresholdr   u]   Frame-delta cutoff: |Δ|>threshold → real motion, |Δ|<=threshold → jitter. Default 0.03.z	--variant)ABCr  uy  V2-dynamics teacher variant. C = compiler only (no V2). A = strict V2 mask (brows + cheek/nose squint + eyeSquint). B = tiered V2 mask: A's channels PLUS mouth smile/frown, eye wide, mouth dimple at α=0.25. Default B — V3 learns to reproduce V2's prosody-driven motion from (audio, VAD) alone. V2 ONNX is loaded only for A/B (data generation only; not used at V3 inference).)choicesr  r  r   z&%(asctime)s %(levelname)s: %(message)s)levelformatTr>   zAudio dir not found: zAudio source (read-only): zLoading user presets from z>Using synthetic preset bootstrap (parametric layer on anchors)u     → z presetszLoaded z scenarios (pre-filter))all*c                 s  s     | ]}|  r|  V  qd S rS   )rm   r  r-   r-   r1   rT  %  s    zmain.<locals>.<genexpr>,c                 s  s    | ]}| d V  qdS )dailyN)r   r  r-   r-   r1   rT  &  s    )r   z;--split-dialogues set: auto-added 'daily_' to filter prefixc                   s"   g | ]}| d d r|qS )rc   re   )rJ   r   r.   r   )prefixesr-   r1   rE   *  s    
zmain.<locals>.<listcomp>zFilter prefixes=z: u    → z
 scenariosc                 s  s    | ]
}| d rdV  qdS )rb   r   NrI   r  r-   r-   r1   rT  6  s    z--split-dialogues: z scenarios (z  per-turn splits from dialogues)z--limit applied: c                 3  s8    | ]\}}|d u r d |  dd rdV  qd S )Nr5   rB   re   r   rJ   rm   )r.   r   rT   )r_   r-   r1   rT  D  s     c                 s  s&    | ]}t d d |d D V  qdS )c                 s  s$    | ]}| d d rdV  qdS )rB   re   r   Nr  rC   r-   r-   r1   rT  I  s   " z!main.<locals>.<genexpr>.<genexpr>r5   N)r  r  r-   r-   r1   rT  H  s
    
zAudio lookup: /z turns have audio (u)    missing — those turns will be skipped)z3No audio found for any scenario. Check --audio_dir.zLoading LAM model...)devicer  u   V2 teacher ENABLED — variant=z6V2 teacher DISABLED (variant=C, compiler-only targets))tqdmzProcess scenarios)descrc   z.npzr   r=  r>  r?  r@  rA  rB  r   rC  rD  rE  rF  rG  rH  rJ  rK  rL  zDone. z" scenarios successfully processed.zOutput: )@argparseArgumentParseradd_argumentr   	SCENARIOS
OUTPUT_DIR	AUDIO_DIRr;   PROJECT_ROOTr   
parse_args	no_tremorrE  loggingbasicConfigINFOoutputrV   ra   ro   
SystemExitr   r   presets_jsonr   r   rX   r   openrn   jsonloadsfilter_prefixrm   lowertuplesplitsplit_dialoguesanyr   r  limitrz   rl   r   r  r{   r   r  r  r=  r>  r?  r@  rA  rB  r   blink_intervalrD  rF  rH  rJ  rK  rL  )apargsr  r   r  line
prefix_strbeforen_splitscenario_audio_pathsmissing_totalr\   total_turnsfound_turnsr*  rG  r  successsir<  rU   r-   )r  r_   r1   main  sj  














	

"







	
r  __main__)r4   r$   )
r5   r6   r7   r   r8   r9   r:   r;   r<   r=   )r_   r`   ra   r   r<   r=   )r{   r9   )r   r6   r<   r6   )r   r,   r   )
r   r   r   r;   r   r;   r   r;   r<   r   )r)   r9   rL   r   r   r;   r<   r   )r   r   r<   r   )r   r   r   r   rD   r   r<   r   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r<   r   )r   r   )r   r   r   r   r   r   r<   r   )r   r   r   )
r   r   r   r   r   r   r   r   r<   r   )
r   r   r   r   r   r   r   r   r<   r   )r   )r   r;   r<   r   )r%   )rR   r   r  r`   r  r   r<   r   )r  )r  r  r   r   r<   r   )r*  r   r+  r   r,  r   r<   r   )r  r&   r%   r-  r7  r   r   r8  r   r9  r   NFr   r   r   ),r:  r`   r;  r=   r*  r   r  r`   r<  r   r=  r   r>  r   r?  r   r@  r   rA  r   rB  r   r   r;   rC  r   rD  r   rE  r   rF  r   rG  r`   rH  rI  rJ  r   rK  r   rL  r   r<   rI  )G__doc__
__future__r   r  asyncior  r  pathlibr   typingr   r   numpyr   torch	constantsr   r   r   r	   
expressiver
   
eye_motionr   lam_wrapperr   tremorr   r   ttsr   utilsr   r   r!  r  r   r   	getLoggerr   __file__resolver?   r  r  r  r  ra  rd  rl   r   r^   rz   r   r   r   r   r   r   r   r   r   r   r  r  r)  r6  r  r  __name__runr-   r-   r-   r1   <module>   s    


#
'

	
 
 & r 
H