
    i                        d Z ddlmZ ddlZddlZddlmZ ddlZddlZ	ddl
Z
 ed          Zedz  dz  Z G d d	          ZdS )
u   Thin wrapper around LAM Audio2Expression for generating lipsync targets.

Model location: /dataset/text-to-face-se/LAM_Audio2Expression
Checkpoint:     checkpoints_황준희_lr5e-6/best.pt
    )annotationsN)Pathz-/dataset/text-to-face-se/LAM_Audio2Expressionu   checkpoints_황준희_lr5e-6zbest.ptc                  Z    e Zd ZedfddZ ej                    ddd            ZdddZdS )
LAMWrapperNcheckpoint_pathr   devicestrc                   t                                           st          dt                      |                                st          d|           t          j                            dt          t                                t          j                    | _	        t          j
        t          t                                	 ddlm} t          j
        | j	                   n# t          j
        | j	                   w xY w|p!t          j                                        rdnd| _        t          j
        t          t                                	  |ddd	d
ddddd	  	                            | j                  | _        t          j        || j        d          }|                    d|                    d|                    }d |                                D             }| j                            |d           | j                                         t          j
        | j	                   d S # t          j
        | j	                   w xY w)NzLAM_DIR not found: zLAM checkpoint not found: r   )Audio2Expressioncudacpuwav2veczfacebook/wav2vec2-base-960hzconfigs/wav2vec2_config.json   @   i   4   lnF)	pretrained_encoder_typepretrained_encoder_pathwav2vec2_config_pathnum_identity_classesidentity_feat_dim
hidden_dimexpression_dim	norm_typeuse_transformer)map_locationweights_onlymodel_state_dict
state_dictc                j    i | ]0\  }}|                     d d                               dd          |1S )zmodule. z	backbone.)replace).0kvs      T/dataset/kemix-engine/package/face/animasync-face-v3/scripts/compiler/lam_wrapper.py
<dictcomp>z'LAMWrapper.__init__.<locals>.<dictcomp>6   sL       Aq 		)R((00bAA1      )strict)LAM_DIRexistsFileNotFoundErrorsyspathinsertr	   osgetcwd	_orig_cwdchdirmodels.networkr   torchr   is_availabler   tomodelloadgetitemsload_state_dicteval)selfr   r   r   ckptr   new_state_dicts          r&   __init__zLAMWrapper.__init__   s'   ~~ 	E#$C'$C$CDDD%%'' 	T#$R$R$RSSS3w<<(((
W	%777777HT^$$$$BHT^$$$$P5:+B+B+D+D!O% 	W	%))(1(E%C%'"$! %
 
 
 boo J :oDKV[\\\D"4dhh|T6R6RSSJ &,,..  N J&&~e&DDDJOOHT^$$$$$BHT^$$$$s   C( (DC
H; ;I   
audio_pathfpsintreturn
np.ndarrayc                   ddl }t          j        t          |          dd          \  }}t	          j        |                                                              d                              | j	                  }|
                    t          |          |z  |z            }|t	          j        dd                              | j	                  |d}|                     |          }	|	                    d                                                                                              t$          j                  }
|
S )	zpRun LAM on an audio file.

        Returns:
            (T, 52) float32 blendshapes at `fps` frame rate
        r   N>  T)srmono   r   )input_audio_arrayid_idx
time_steps)mathlibrosar9   r	   r5   
from_numpyfloat	unsqueezer7   r   ceillenzerosr8   squeezer   numpyastypenpfloat32)r>   rC   rD   rP   wavrJ   
wav_tensor
num_frames
input_dictoutputbss              r&   infer_audiozLAMWrapper.infer_audio?   s    	,s:5tDDDR%c**0022<<Q??BB4;OO
YYs3xx"}s233
 ",k!R((++DK88$
 


 J''^^A""$$**,,33BJ??	r(   rI   r]   rJ   c                    ddl }ddl}|                    dd          5 }|                    |j        ||           |                     t          |j                            cddd           S # 1 swxY w Y   dS )z5Run LAM on a raw audio array (already at 16kHz mono).r   Nz.wavF)suffixdelete)tempfile	soundfileNamedTemporaryFilewritenamerc   r   )r>   r]   rJ   rg   sffs         r&   infer_wav_arrayzLAMWrapper.infer_wav_arrayT   s    ((u(EE 	2HHQVS"%%%##DLL11	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2s   AA00A47A4)r   r   r   r	   )rB   )rC   r   rD   rE   rF   rG   )rI   )r]   rG   rJ   rE   rF   rG   )	__name__
__module____qualname__LAM_CHECKPOINTrA   r5   no_gradrc   rn    r(   r&   r   r      sy        /=T (% (% (% (% (%T U]__    _(2 2 2 2 2 2 2r(   r   )__doc__
__future__r   r0   r-   pathlibr   rQ   rY   r[   r5   r*   rr   r   rt   r(   r&   <module>rx      s    
 # " " " " " 				 



            
$>
?
?99IEF2 F2 F2 F2 F2 F2 F2 F2 F2 F2r(   