o
    j                     @  s   d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 e
e jd Zd ddZd!ddZd"ddZd#ddZedkrJe  dS dS )$uJ  Curate a subset from the full dataset viewer archive into the active
player folder.

Reads from `data/viewer_dataset/` (full archive written by
dataset_to_viewer.py) and copies a curated selection of scenarios into
`data/viewer/` so the player dropdown stays focused. Rebuilds
`data/viewer/manifest.json` with only the curated entries.

Default selection (deterministic with --seed):
  - longs:   ALL    (multi-turn monologues — main signal for emotion arcs)
  - solos:   30     (sampled, diversified across emotion classes)
  - dailies: 20     (per-turn dialogue splits, diversified across emotions)

Usage:
    python3 scripts/curate_viewer.py                  # default selection
    python3 scripts/curate_viewer.py --solos 50 --dailies 0
    python3 scripts/curate_viewer.py --clean          # also remove old *_dataset_* from viewer/
    )annotationsN)defaultdict)Path   sidstrreturn
str | Nonec                 C  s6   |  drdS |  drdS |  drd| v rdS d S )Nlong_solo_daily__tdaily_-split
startswith)r    r   scripts/curate_viewer.py	_classify    s   

r   entries
list[dict]nintrngrandom.Randomc           
      C  s   |dks| sg S |t | krt| S tt}| D ]}|dp!dg}||d  | qt|}|| |D ]	}|||  q7g }t ||k rrd}	|D ]}|| rf|||   d}	t ||krf nqM|	sl	 |S t ||k sI|S )z>Round-robin sample n entries across distinct primary emotions.r   emotionsneutralFT)lenlistr   getappendshufflepop)
r   r   r   by_emoeemoskeyskpicks
progressedr   r   r   _sample_diverse*   s6   

r)   sdictarchiver   viewerc                 C  sp   | d }d}|| d }|  rt|||j  |d7 }|| d }|  r6t|||j  |d7 }|S )u   Copy one scenario's JSON + MP3 from archive → viewer. Returns count
    of files written. One JSON and one MP3 per scenario — no A/B/C duplication.scenario_idr   z.jsonr   z.mp3)existsshutilcopy2name)r*   r,   r-   basewrittensrc_jsonsrc_mp3r   r   r   _copy_scenarioF   s   r7   Nonec                  C  s  t jtd} | jdttd d dd | jdttd d d	d | jd
tddd | jdtddd | jdtddd | jdtdd | jdddd |  }t	|j
}|jd }| sftd| dt| }g g g d}|dg D ]}|dd }|d!d }t|}	|	r||	 | qyg }
|jdks|jt|d" kr|
|d"  n|
t|d" |j| |
t|d# |j| |
t|d$ |j| |
std%t|d"  d&t|d#  d't|d$  |jjd(d(d) |jrCd}|jd*D ]}z|  |d+7 }W q  t y   Y q w |jd,D ]}z|  |d+7 }W q t y7   Y qw t!d-| d.|j  d}|
D ]}|t"||j|j7 }qG|jd }| rrt| }d/d0 |dg D }ng }||
 |#tj$d|id1d2d3 t%d4d5 |
D }t%d6d5 |
D }t%d7d5 |
D }t!d8| d9| d:| d;t|
 d<| d= t!d>|j  t!d?|j  d S )@N)descriptionz	--archivedataviewer_datasetz2Full archive folder (dataset_to_viewer.py output).)typedefaulthelpz--viewerr-   z&Active player folder (curated subset).z--longsr   z,How many long_ samples (0 = all). Default 0.z--solos   z#How many solo_ samples. Default 30.z	--dailies   z*How many daily_-split samples. Default 20.z--seed*   )r<   r=   z--clean
store_truez\Remove existing *_dataset_* files from viewer BEFORE curation (recommended after re-baking).)actionr>   zmanifest.jsonzArchive manifest not found: z 
Run dataset_to_viewer.py first.)r
   r   r   	scenariosr.    _datasetr
   r   r   z)No scenarios matched. Archive has: long_=z, solo_=z, daily_-split=T)parentsexist_okz*_dataset_*r   z*_dataset_*.concat.txtz--clean: removed z *_dataset_* files from c                 S  s"   g | ]}d | dpdvr|qS )rF   r3   rE   )r   ).0r#   r   r   r   
<listcomp>   s    zmain.<locals>.<listcomp>F   )ensure_asciiindentc                 s  "    | ]}|d   drdV  qdS )r.   r
   r   Nr   rI   r*   r   r   r   	<genexpr>        zmain.<locals>.<genexpr>c                 s  rN   )r.   r   r   Nr   rO   r   r   r   rP      rQ   c                 s  s.    | ]}|d   drd|d  v rdV  qdS )r.   r   r   r   Nr   rO   r   r   r   rP      s    z	curated: z long, z solo, z daily-split = z scenarios, z files writtenzviewer: z	archive: )&argparseArgumentParser__doc__add_argumentr   PROJECT_ROOTr   
parse_argsrandomRandomseedr,   r/   
SystemExitjsonloads	read_textr   replacer   r   longsr   extendr)   solosdailiesr-   mkdircleanglobunlinkOSErrorprintr7   
write_textdumpssum)apargsr   archive_manifest_fparchive_manifestbucketsr*   r   base_for_classcatselected	n_removedptotal_writtenexisting_manifest_fpexistingkeptn_longn_solon_dailyr   r   r   mainV   s   












r~   __main__)r   r   r   r	   )r   r   r   r   r   r   r   r   )r*   r+   r,   r   r-   r   r   r   )r   r8   )rT   
__future__r   rR   r\   rX   r0   collectionsr   pathlibr   __file__resolverG   rV   r   r)   r7   r~   __name__r   r   r   r   <module>   s    




b
