o
    j$                     @  s   d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	Z
edZee jd ZdddZdddZdd ZedkrGe  dS dS )u5  Convert dataset .npz files (from data_pipeline.py output) into viewer
format so they can be loaded in tools/blendshape-player.html for visual QA.

NEW DEFAULT (2026-05): output goes to `data/viewer_dataset/` (the FULL
archive of every baked .npz). The active player folder `data/viewer/` is
populated separately by `scripts/curate_viewer.py`, which copies a curated
subset (all long_, a sample of solo_/daily-split) so the player dropdown
stays uncluttered.

For each .npz, builds:
  - <viewer_dir>/<sid>_dataset_{A,B,C}.json  — three identical files (player
    expects A/B/C variants; for dataset preview they all carry the same
    target trajectory — dataset has no variant split)
  - <viewer_dir>/<sid>_dataset_ABC.mp3       — concatenated turn audio
  - <viewer_dir>/<sid>_dataset_{A,B,C}.mp3   — symlinks to the above
  - manifest.json entry under base "<sid>_dataset"

NOTE: data_pipeline.py bakes brow + eyeSquint tremor into the .npz targets
(and variant B also bakes V2 prosodic motion). When previewing in
blendshape-player.html, all runtime overlays auto-disable for `_dataset_`
scenarios — what you see IS the .npz.

Usage:
    PYTHONPATH=. python3 scripts/dataset_to_viewer.py             # convert all in v3_training/
    PYTHONPATH=. python3 scripts/dataset_to_viewer.py -s long_001 long_046    # specific
    )annotationsN)Pathz^(daily_.+)_t(\d+)$   scendict	audio_dirr   return
list[Path]c                 C  s   |  d}|d ur| d n| d }g }t| d D ]T\}}| dd s&q|d ur.|| n|}| dd}|| d	| d
| d }	|	 rT|	 jdkrT||	 qdd || d	| dD }
|
rm||
d  q|S )Nsource_turn_indicessource_scenario_idscenario_idturnstext emotionneutral_t_.mp3  c                 S  s   g | ]}|  jd kr|qS )r   )statst_size).0m r   scripts/dataset_to_viewer.py
<listcomp>D       z(find_audio_for_turns.<locals>.<listcomp>z_*.mp3r   )get	enumeratestripexistsr   r   appendglob)r   r   src_tissidoutlocal_titurn	actual_tiemoexactmatchesr   r   r   find_audio_for_turns.   s    
 r-   pathsout_mp3Nonec                 C  s   | d}|d}| D ]}|d|  d qW d    n1 s%w   Y  tjddddd	d
dddt|ddddt|gdd |  d S )Nz.concat.txtwzfile 'z'
ffmpegz-yz	-loglevelerrorz-fconcatz-safe0z-iz-c:a
libmp3lamez-b:a128kT)check)with_suffixopenwriteresolve
subprocessrunstrunlink)r.   r/   	list_filefpr   r   r   concat_audioK   s   
rD   c                    sD  t  } | jdttd d | jdttd d | jdttd d | jdttd	 d
d | jddd | jdddd dd |  }ddlm} i dD ]1}|jj	| }|
 sYqL| }|D ]}t|}||d < q`W d    n1 sxw   Y  qL|jr|j}ntdd |jdD }tdt| d |jjddd |jd }	|	
 rt|	 nd g i}
dFfd%d&}d}|D ]}|j| d' }|
 std(| d) q||}|d u rtd(| d* qt|}|d+ }|jd }||j  t||j}|j  d, }|rt|| d-d. t|d/ D } d0t|||t |d1! d2}|j  d3 }|"tj#|d4d5  fd6d.|
d  D |
d < |
d  $  g t|d7d. |D |rj|d %d8d9nd9d d: d; |d<7 }td=  d>| d?t| d@ q|	"tj#|
d4dAdB tdC| dDt| dE d S )GNz	--npz_dirzdata/v3_training)typedefaultz--scenarios_jsonlz#data/emotion/seed_train_final.jsonlz--audio_dirzdata/audio_previewz--viewer_dirzdata/viewer_datasetzWOutput dir for the full archive. Curate into data/viewer/ via scripts/curate_viewer.py.)rE   rF   helpz--suffix_dataset)rF   z-sz--scenarios*z;Specific scenario IDs to convert. Default: all .npz in dir.)nargsrF   rG   r   )ARKIT_52_NAMES)zseed_train_final.jsonlzseed_val.jsonlzseed_test.jsonlzseed_split_dialogues.jsonlr   c                 s  s    | ]}|j V  qd S )N)stem)r   rC   r   r   r   	<genexpr>|   s    zmain.<locals>.<genexpr>z*.npzzconverting z
 scenariosT)parentsexist_okzmanifest.json	scenariosr%   r?   r   dict | Nonec              
     s   |  v r |  S t | }|sdS |d|d}} |}|du r'dS z
|d t| }W n tttfy>   Y dS w | |t|g|gdS )ar  Resolve a .npz stem to its scenario metadata. Direct hit in the
        loaded JSONLs first; on miss, try to synthesize a per-turn dialogue
        split by stripping a `_tK` suffix and looking up the parent.

        Robust to the sidecar `seed_split_dialogues.jsonl` being missing or
        only containing entries from the most recent --split-dialogues run.
        Nr      r   )r   r   r
   r   )	_SPLIT_REmatchgroupr   intKeyError
IndexError
ValueError)r%   r   src_sid
src_ti_strparentr(   )scenario_lookupr   r   resolve_scenario   s&   

zmain.<locals>.resolve_scenarioz.npzu     ✗ z: no .npz foundz: not in scenario JSONLtargetr   c              
   S  sT   g | ]&\}}| d d r|| dd| dg d| d d| dddqS )r   r   r   r   vad)r   r   r   speaker)turn_idxr   r`   r   ra   )r   r    )r   titr   r   r   r      s    


zmain.<locals>.<listcomp>r         )r   fps
num_framesnamesr   blendshapesz.jsonF)ensure_asciic                   s   g | ]}| d  kr|qS )base)r   )r   s)new_baser   r   r      r   c                 S  s   g | ]}|d  qS )r   r   )r   rd   r   r   r   r      s    r   r   2   )rl   r   variantsn_turnsemotionstext_previewr   u     ✓ z  (z	 frames, z turns)rR   )rk   indentz
manifest updated. converted /z scenarios.)r%   r?   r   rQ   )&argparseArgumentParseradd_argumentr   PROJECT_ROOT
parse_argsscripts.compiler.constantsrK   scenarios_jsonlr\   r!   r:   jsonloadsrP   sortednpz_dirr#   printlen
viewer_dirmkdir	read_textnploadshapesuffixr-   r   rD   r   rV   roundtolist
write_textdumpsr"   r   )apargsrK   
jsonl_namerC   rB   linerm   sidsmanifest_fpmanifestr^   	convertedr%   npz_pathr   datar_   Taudio_pathsr/   
turns_metaviewer_jsonjson_outr   )rn   r]   r   mainX   s   







	 "r   __main__)r   r   r   r   r   r	   )r.   r	   r/   r   r   r0   )__doc__
__future__r   rv   r}   rer=   pathlibr   numpyr   compilerS   __file__r<   rN   ry   r-   rD   r   __name__r   r   r   r   <module>   s"   


 
