o
    i                     @  sj   d Z ddlmZ ddlZddlZddlmZ ddlZddlZ	ddl
Z
edZed d ZG dd	 d	ZdS )
u   Thin wrapper around LAM Audio2Expression for generating lipsync targets.

Model location: /dataset/text-to-face-se/LAM_Audio2Expression
Checkpoint:     checkpoints_황준희_lr5e-6/best.pt
    )annotationsN)Pathz-/dataset/text-to-face-se/LAM_Audio2Expressionu   checkpoints_황준희_lr5e-6zbest.ptc                   @  s<   e Zd ZedfdddZe ddddZddddZdS )
LAMWrapperNcheckpoint_pathr   devicestrc                 C  s@  t  stdt  | std| tjdtt  t | _	t
tt  zddlm} W t
| j	 nt
| j	 w |pKtj rJdnd| _t
tt  zD|ddd	d
dddddd	| j| _tj|| jdd}|d|d|}dd | D }| jj|dd | j  W t
| j	 d S t
| j	 w )NzLAM_DIR not found: zLAM checkpoint not found: r   )Audio2Expressioncudacpuwav2veczfacebook/wav2vec2-base-960hzconfigs/wav2vec2_config.json   @   i   4   lnF)	pretrained_encoder_typepretrained_encoder_pathwav2vec2_config_pathnum_identity_classesidentity_feat_dim
hidden_dimexpression_dim	norm_typeuse_transformer)map_locationweights_onlymodel_state_dict
state_dictc                 S  s&   i | ]\}}| d d dd|qS )zmodule. z	backbone.)replace).0kv r"   T/dataset/kemix-engine/package/face/animasync-face-v3/scripts/compiler/lam_wrapper.py
<dictcomp>6   s    z'LAMWrapper.__init__.<locals>.<dictcomp>)strict)LAM_DIRexistsFileNotFoundErrorsyspathinsertr   osgetcwd	_orig_cwdchdirmodels.networkr   torchr	   is_availabler   tomodelloadgetitemsload_state_dicteval)selfr   r   r   ckptr   new_state_dictr"   r"   r#   __init__   sD   

zLAMWrapper.__init__   
audio_pathfpsintreturn
np.ndarrayc                 C  s   ddl }tjt|ddd\}}t| d| j	}|
t|| | }|tdd| j	|d}| |}	|	d  tj}
|
S )	zpRun LAM on an audio file.

        Returns:
            (T, 52) float32 blendshapes at `fps` frame rate
        r   N>  T)srmono   r   )input_audio_arrayid_idx
time_steps)mathlibrosar5   r   r1   
from_numpyfloat	unsqueezer3   r   ceillenzerosr4   squeezer
   numpyastypenpfloat32)r:   r?   r@   rK   wavrE   
wav_tensor
num_frames
input_dictoutputbsr"   r"   r#   infer_audio?   s   
zLAMWrapper.infer_audiorD   rX   rE   c                 C  sb   ddl }ddl}|jddd}||j|| | t|jW  d   S 1 s*w   Y  dS )z5Run LAM on a raw audio array (already at 16kHz mono).r   Nz.wavF)suffixdelete)tempfile	soundfileNamedTemporaryFilewritenamer^   r   )r:   rX   rE   ra   sffr"   r"   r#   infer_wav_arrayT   s   $zLAMWrapper.infer_wav_array)r   r   r   r   )r>   )r?   r   r@   rA   rB   rC   )rD   )rX   rC   rE   rA   rB   rC   )	__name__
__module____qualname__LAM_CHECKPOINTr=   r1   no_gradr^   rh   r"   r"   r"   r#   r      s
    *r   )__doc__
__future__r   r,   r)   pathlibr   rL   rT   rV   r1   r&   rl   r   r"   r"   r"   r#   <module>   s    