o
    xjjW                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 edZG dd deZed	d
Zg dg dg dg dg ddZg dZddddddZddddddZdd Zi ddddd dd!dd"dd#dd$d$d%d$d&d$d'd$d(d$d)d)d*d)d+d+d,d+d-d+Zed.Z			dldmd6d7Zdnd:d;Zdod=d>Zed?d@ZdpdBdCZdqdrdGdHZdsdKdLZ 					dtdudUdVZ!dvdwdYdZZ"	[	\					dxdydjdkZ#dS )zu  TTS backends for data pipeline.

Two implementations:
    - ElevenLabs (paid, emotional prosody) — recommended
    - edge-tts (free fallback)

Env vars:
    ELEVENLABS_API_KEY  (required for elevenlabs)
    ELEVENLABS_VOICE_ID (optional, default to multilingual-friendly voice)
    )annotationsN)Path)Listttsc                   @  s   e Zd ZdZdS )QuotaExhaustedu   Raised when ElevenLabs returns 401/402/quota — non-retryable.
    synth_all catches this once and aborts the whole batch so the user
    can see a single clear message and re-run with a different key.N)__name__
__module____qualname____doc__ r   r   L/dataset/kemix-engine/package/face/animasync-face-v3/scripts/compiler/tts.pyr      s    r   ELEVENLABS_MODEL	eleven_v3)FCdKzv68Ofr4VUDcZXIyKsAmBSHXsuxsZ1lZZXlGoHB9Xhox1bqMl1TvkmelZubHeGTOAkECknc02Zmo)m3yAHyFEFKtbCIM5n7GFk9073AMdU5sAUtPMH1ilhWXqitL3DEOLD49pgNWRz2P4oCxSHhXan3ew4COv)TbMNBJ27fH2U0VgpSNko1W00IGEmNmwmsDeYy7agOEaq3WGNtNvFJ5co9mJExi3rF0t7dg7uN2M0WUhr)8tsLeAV5vPVuzCCvqbbUs0XGIcqmceN2l7kjsqoZiWLjl1zCuqXRkW6494ve8QclKarwLctvzN2plke3)XfNU2rGpBa01ckF309OY93nuHbke4dTER9x2pDwEYDDaC9XKjODs7hY78qEWBNr4zvrC1bGIdIstzjFQ)angersadnessjoyneutralsurprise)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    c           
        s   dd | D }|sdS ddl m} | }i  |D ]:}|dp d}t|d}||  d7  < |dp7g d	}t|d
krFtt|d nd} |d|  |< qt| fdd|	 D }	t|	dkrn|	d S t|	 fdddS )u  Pick the most-frequent base emotion across a scenario's turns.

    `turns` is a list of dicts with 'emotion' (and optionally 'vad').
    Returns the base name ('joy', 'sadness', etc.). Tie-break: highest
    summed |arousal| of the tied bases. Empty input → 'neutral'.
    c                 S  s"   g | ]}| d p
d r|qS )text )getstrip).0tr   r   r   
<listcomp>p   s   " z+dominant_base_for_turns.<locals>.<listcomp>r&   r   )Counteremotion   vad)        r3   r3      r3   c                   s   g | ]
\}}| kr|qS r   r   )r,   bc)	top_countr   r   r.   ~   s    c                   s     | dS )Nr3   )r*   )r5   )arousal_sumr   r   <lambda>   s    z)dominant_base_for_turns.<locals>.<lambda>)key)
collectionsr/   r*   EMOTION_TO_BASElenabsfloatmaxvaluesitems)
turnsrealr/   countsr-   emobaser2   atiedr   )r8   r7   r   dominant_base_for_turnsg   s$   	 rJ   r&   r%   laughter
excitement	agreement	gratituder$   cryingsulkapologystruggler#   refusalr'   flustershyELEVENLABS_VOICE_IDr0   
str | Noneoverrideseedpool_overridereturnstrc                 C  s~   |r|S t rt S |dkrt}n|rt| }nt| pdd}t| }|du r*|d S tt|d	 d}||t
|  S )u  Pick voice ID from pool keyed by emotion → base emotion.

    seed: stable key (e.g. text or scenario_id+turn_idx) to deterministically
    pick a voice from the pool. Without seed, picks first voice in pool.
    Deterministic selection ensures same turn always gets same voice on re-run.

    pool_override: force voice selection from a specific base-emotion pool
    (e.g. 'neutral' for monologues — same voice across all emotions in turn).
    female_monologuer&   Nr   utf-8   )SINGLE_VOICE_OVERRIDEFEMALE_MONOLOGUE_POOLVOICE_POOLS_BY_BASEr<   r*   inthashlibmd5encode	hexdigestr=   )r0   rX   rY   rZ   poolrG   hr   r   r   voice_id_for_emotion   s   
rj   r2   list | Nonec                 C  sv   | du rdddddS | \}}}t t|}ttdtdd	d
|  }ttdtddd|  }d}|||ddS )u  Map VAD → ElevenLabs voice_settings for expressive prosody.

    For v3 + audio tags: tags carry the emotion, voice_settings carry
    consistency. Stacking high `style` on top of expressive tags
    triple-amplifies expression and produces distortion (raspy / accent
    drift / foreigner-sounding) — that was the failure mode observed on
    2026-05-07. We keep style modest and stability solid.

    Intensity is |A| magnitude — both high-positive (excited) and
    low-negative (sad) arousal scale prosody equally.
    Ng?g      ?g333333?T)	stabilitysimilarity_booststyleuse_speaker_boostr3   g      ?g?g      ?g?g?)r>   r?   r@   min)r2   VAD	intensityrn   rl   rm   r   r   r   _build_voice_settings   s   
ru   r(   c                 C  s$   |   } | r| d dvr| d S | S )uF  Ensure text ends with sentence-ending punctuation so v3 doesn't cut the tail.

    Note: `、` (U+3001) is a Japanese reading-comma, not a Korean sentence
    terminator — it was here by mistake. Korean uses ASCII `.?!` plus
    sometimes `…`. CJK terminators `。？！` are kept as a safety net for
    pasted text.
    u   .?!。？！….)r+   )r(   r   r   r   _ensure_trailing_punct   s   rx   reu9   [ㅠㅜ]{2,}|[ㅋㅎ]{2,}|[ㅏㅓㅗㅡㅣ]{2,}|[ㅡ]{2,}tuple[str, bool, bool]c                 C  sn   ddl }t|d| }t|d| }|dd| } |dd| } |dd| } |d	d|  } | ||fS )
ud  v3 reads bare Korean jamo runs (ㅠㅠ, ㅋㅋ, ㅎㅎ) literally — comes out
    as "어어/유유/크크" instead of sobbing/laughing. Replace with v3 audio
    tags so they render as real reactions.

    ㅠㅠ / ㅜㅜ → [crying]    (canonical v3 demo tag — fires reliably
                              where [sniffles] was getting ignored
                              mid-Korean-sentence)
    ㅋㅋ / ㅎㅎ → [laughs]    (canonical v3 demo tag — fires reliably
                              where [chuckles] was getting ignored)
    bare vowel runs (ㅏㅏ, ㅡㅡ…) → stripped (garbage)

    No leading period — that produced a hard sentence break before the
    reaction (audible "speak…long pause…laugh" instead of "speak laugh").
    The canonical tags fire mid-clause without it.

    Returns (cleaned_text, had_sob, had_chuckle) — flags reserved but
    head-tag suppression is OFF by default now. Head tag carries the
    emotional prosody for the whole line; the body tag just stamps the
    reaction sound. Suppressing the head was making the voiced emotion
    drop out entirely.
    r   Nu   [ㅠㅜ]{2,}u   [ㅋㅎ]{2,}z
 [crying] z
 [laughs] u   [ㅏㅓㅗㅡㅣ]{2,} z\s+)ry   boolsearchsubr+   )r(   r0   ry   had_sobhad_chuckler   r   r   _scrub_korean_emoticons   s   
r      	min_charsrc   c                 C  s"   t |  |k r| dd S | S )u   v3 fails / hallucinates on very short inputs (e.g. "응.", "네.").

    Pad with neutral filler so the model has enough context. We add an
    ellipsis-style trailer that v3 reads as a brief pause, not extra speech.
    rw   z...)r=   r+   rstrip)r(   r   r   r   r   _pad_short_text  s   r   use_v3_tagsr|   c                 C  s   ddl }|dd| } t| |\} }}t| } t| } |r#|r#|dkr%| S dddddd	d
ddddddddd}||dp>d}|rG|dv rGd}|rO|dv rOd}|r[d| d|   S | S )ux  Prepend emotion tag for v3 models (supports bracketed audio tags).

    Adds a leading period buffer before the tag — v3 sometimes eats the first
    token after an audio tag, which truncates the first word of real content.
    Also ensures trailing punctuation so the model doesn't cut the final word.
    Short texts are padded — v3 hallucinates on <12-char inputs.
    r   Nz
\[[^\]]*\]r{   r&   z	[happily]z[laughs]z	[excited]z[sorrowful]z[crying]z[sighs]z[resigned tone]z[frustrated]z[angry]z[flatly]z[surprised]z
[stammers]z
[whispers])r%   rK   rL   rM   rN   r$   rO   rP   rQ   rR   r#   rS   r'   rT   rU   r)   )rK   r%   rL   )r$   rO   rP   rQ   z. )ry   r~   r   r   rx   r*   r+   )r(   r0   r   ry   r   r   tag_maptagr   r   r   _format_text_with_emotion  s:   
r   out_pathr   voice_idmodelapi_keylist
voice_seed
voice_poolc	                   s  ddl m  t||pj|dptptds#tdd v }	t	| ||	t
|t }
 fdd}dd	l}d
}t|D ]	}z|
d	|I d	H  W  d	S  tyW } zt| dv pdv pdv pdv pdv pdv pdv }|rz#tdjd dt|j dt|dd dt|dd d	dd W n	 ty   Y nw tdd	d  |dv pd v }d!v pd"v pd#v }tfd$d%d&D }tfd'd%d(D }|p|p|p|}|rR||d) k rR|rd*nd+}td,|d-|  |dd. }|rd!n|r!dn|r&d/nd0}td1| d2d	d3  d4|d)  d5| d6|d7d8 t|I d	H  W Y d	}~qM d	}~ww d	S )9u2  Synthesize one clip with ElevenLabs, using emotion+VAD for prosody.

    If voice_id is None, picks voice automatically based on emotion → base mapping.
    voice_seed: stable key for deterministic voice pick from the pool.
    voice_pool: override base-emotion pool (e.g. 'neutral' for monologues).
    r   )
ElevenLabs)rX   rY   rZ   ELEVENLABS_API_KEYz#ELEVENLABS_API_KEY env var requiredv3c                    s   dd l }  d}|jjdd}d}|d}|D ]}|| q!W d    n1 s3w   Y  z!| jddd	d
dt|ddddddtgdd |jdd W d S  t	| j
fy} } ztd| d | W Y d }~d S d }~ww )Nr   )r   mp3_44100_128)r   r(   model_idoutput_formatvoice_settingsz.raw.mp3wbffmpegz-yz	-loglevelerrorz-iz-afz adelay=180|180,apad=pad_dur=0.18z-c:a
libmp3lamez-b:a128kT)check)
missing_okz[tts] ffmpeg pad failed (z); saving raw audio)
subprocesstext_to_speechconvertwith_suffixopenwriterunr\   unlinkFileNotFoundErrorCalledProcessErrorLOGwarningreplace)r   clientaudioraw_pathfchunke)r   r   formatted_textr   r   r   r   r   r   _sync^  s>   


z#synth_one_elevenlabs.<locals>._syncN   401402403quotaunauthorizedpayment_requiredbillingz/tmp/tts_quota_error.txtz=== full exception ===
z

=== type ===
z

=== status_code attr ===
status_codezn/az

=== body attr ===
body
r^   )encodingzElevenLabs auth/quota error:    409already_running429
rate_limittoo_many_requestsc                 3      | ]}| v V  qd S Nr   r,   r6   )msgr   r   	<genexpr>      z'synth_one_elevenlabs.<locals>.<genexpr>)z 500 z 502 z 503 z 504 c                 3  r   r   r   r   )lowr   r   r     r   )timeout
connectionzremote disconnectedr1   g       @g       @g      N@r4   g      ?5xxnet[tts] z
 on voice    u   …, retry /z in z.1fs)elevenlabs.clientr   rj   stemDEFAULT_EL_MODELosgetenvRuntimeErrorlowerr   ru   asyncioget_event_looprandomrangerun_in_executor	Exceptionr\   r   
write_texttyper   getattrr   anyrp   uniformr   infosleep)r(   r   r   r   r   r0   r2   r   r   r   loopr   r   max_retriesattemptr   is_quotais_409is_429is_5xxis_net	retryablerG   delaykindr   )	r   r   r   r   r   r   r   r   r   r   synth_one_elevenlabsA  s   /


"8r   ko-KR-SunHiNeuralvoicec                   sj   t jdd|d| dt|t jjt jjd	I dH }| I dH \}}|jdkr3td|	 dd	  dS )
zFree fallback via edge-tts CLI.zedge-ttsz--voicez--textz--write-media)stdoutstderrNr   zedge-tts failed: r   )
r   create_subprocess_execr\   r   DEVNULLPIPEcommunicate
returncoder   decode)r(   r   r   proc_errr   r   r   synth_one_edge  s   
r  
elevenlabs   texts	List[str]	out_paths
List[Path]backendconcurrencyemotionsvads
List[list]voice_seedsvoice_pools	voice_ids
List[bool]c	           
        s   t 
t ksJ t|dgt 
 dddd dkr$tnt	d 	
fdd	tjfd
dtt 
D  I dH  r\td dt 
 d  d rntd d dd  d S )zParallel TTS for a batch. Returns success flag per item.

    emotions/vads: per-item lists aligned with texts. If provided,
    passed into the TTS call for emotional prosody.
    Fr   r)   )flagreasonr  irc   c                   s  4 I d H   d r	 W d   I d H  d S |  }|  r;| jdkr;d| < d7 	 W d   I d H  d S z`t}dkrd urN|  |d< 
d urX
|  |d< d urh|  d urh|  |d< d urx|  d urx|  |d	< d ur|  d ur|  |d
< 	|  |fi |I d H  d| < W n@ ty } zd d< t| d< td|  W Y d }~n,d }~w ty } zt	d|  d|  W Y d }~nd }~ww W d   I d H  d S W d   I d H  d S W d   I d H  d S 1 I d H sw   Y  d S )Nr  i  Tr1   r  r0   r2   r   r   r   r  u*   [tts] QUOTA EXHAUSTED — aborting batch: z[tts] fail z: )
existsstatst_sizedictr   r\   r   r   r   r   )r  path
per_kwargsr   )abortedr	  r  kwargsokr  	semaphoreskippedsynth_fnr  r  r  r  r  r   r   one  sR   "0zsynth_all.<locals>.onec                   s   g | ]} |qS r   r   )r,   r  )r!  r   r   r.     s    zsynth_all.<locals>.<listcomp>Nr   r   z  turns skipped (already on disk)r  u<   
[tts] ⚠ batch aborted on quota/auth error.
      reason: r  r   uz   
      partial results saved. Re-run with a different ELEVENLABS_API_KEY to resume — already-done turns will be skipped.)r  rc   )r=   r   	Semaphorer   r  gatherr   print)
r  r  r	  r
  r  r  r  r  r  r  r   )r  r	  r  r  r  r!  r  r  r  r   r  r  r  r  r  r   	synth_all  s    

(&!
r%  )NNN)
r0   rW   rX   rW   rY   rW   rZ   rW   r[   r\   )r2   rk   )r(   r\   r[   r\   )r(   r\   r0   rW   r[   rz   )r   )r(   r\   r   rc   r[   r\   )r(   r\   r0   rW   r   r|   r[   r\   )NNNNNNN)r(   r\   r   r   r   r\   r   r\   r   r\   r0   r\   r2   r   r   r\   r   r\   )r   )r(   r\   r   r   r   r\   )r  r  NNNNN)r  r  r  r  r	  r\   r
  rc   r  r  r  r  r  r  r  r  r  r  r[   r  )$r
   
__future__r   r   rd   loggingr   pathlibr   typingr   	getLoggerr   r   r   r   r   rb   ra   FEMALE_BY_BASEMALE_BY_BASErJ   r<   r`   rj   ru   rx   
__import__compile_KOREAN_EMOTICON_REr   r   r   r   r  r%  r   r   r   r   <module>   s    

%	



 
4u