o
    j#                     @  s   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ G d	d
 d
ejZG dd dejZG dd dejZeee	eeB Zeee
ZG dd dejZdS )u  V3 face model — split-branch causal TCN with FiLM conditioning.

Architecture:

    audio (T, 80) ⊕ cond (T, 19)  →  Linear(99 → hidden)
                                      │
                                      ▼
            shared backbone: 6× DilatedCausalConv1d (d=1..32) + FiLM
                                      │
                       ┌──────────────┴──────────────┐
                       ▼                             ▼
       lipsync branch                       expression branch
       2× TCN (d=64,128)                    2× TCN (d=64,128)
                       ▼                             ▼
       Linear(hidden → 31)                  Linear(hidden → 21)
       sigmoid                              sigmoid
                       │                             │
                       └──────────┬──────────────────┘
                                  ▼
                       combined (T, 52) blendshape output
                       (lipsync values at LIPSYNC + SHARED indices,
                        expression values at EXPRESSION_ONLY indices)

Freezing:
    model.freeze_lipsync()  →  shared backbone + lipsync branch + lipsync
                                head are no_grad. Expression branch +
                                head remain trainable. Lipsync output
                                becomes bit-for-bit deterministic from
                                audio input.

~3.7 M params at hidden=192, ~1.2 ms/frame CPU.
Quantization-friendly (Conv1d + Linear + GELU + sigmoid, no attention).
    )annotationsN)nn)
functional)LIPSYNC_ONLYEXPRESSION_ONLYSHARED_CHANNELS   )V3FaceConfigc                      s,   e Zd ZdZd fddZdddZ  ZS )FiLMu   Per-frame Feature-wise Linear Modulation conditioning.

    Predicts (γ, β) from `cond` and applies `x * (1 + γ) + β`.
    cond_dimint
hidden_dimc                   s@   t    t|d| | _tj| jj tj| jj d S )N   )	super__init__r   Linearprojinitzeros_weightbias)selfr   r   	__class__ models/v3_face/model.pyr   5   s   
zFiLM.__init__xtorch.Tensorcondreturnc                 C  s,   |  |}|jddd\}}|d|  | S )Nr   dimg      ?)r   chunk)r   r   r   gbgammabetar   r   r   forward<   s   
zFiLM.forward)r   r   r   r   r   r   r   r   r   r   __name__
__module____qualname____doc__r   r'   __classcell__r   r   r   r   r
   /   s    r
   c                      s,   e Zd ZdZd fddZdddZ  ZS )CausalConv1dz6Left-padded 1D conv that never looks at future frames.channelsr   kernel_sizedilationc                   s2   t    |d | | _tj||||dd| _d S )Nr   r   )r2   padding)r   r   left_padr   Conv1dconv)r   r0   r1   r2   r   r   r   r   E   s
   

zCausalConv1d.__init__r   r   r   c                 C  s   t || jdf}| |S )Nr   )Fpadr4   r6   )r   r   r   r   r   r'   K   s   
zCausalConv1d.forward)r0   r   r1   r   r2   r   )r   r   r   r   r)   r   r   r   r   r/   B   s    r/   c                      s,   e Zd ZdZd fd	d
ZdddZ  ZS )TCNBlocku9   Residual block: 2× CausalConv1d + FiLM + GELU + dropout.r   r   r1   r2   r   dropoutfloatc                   sZ   t    t|||| _t|||| _t|| _t|| _t	||| _
t|| _d S N)r   r   r/   conv1conv2r   	LayerNormnorm1norm2r
   filmDropoutr:   )r   r   r1   r2   r   r:   r   r   r   r   S   s   
zTCNBlock.__init__r   r   r   r   c                 C  s   |}| dd}| |}| dd}| |}t|}| |}| dd}| |}| dd}| |}| ||}t|}| |}|| S )Nr   r   )		transposer=   r@   r7   gelur:   r>   rA   rB   )r   r   r   residualhr   r   r   r'   ]   s   







zTCNBlock.forward)
r   r   r1   r   r2   r   r   r   r:   r;   r(   r)   r   r   r   r   r9   P   s    
r9   c                      s`   e Zd ZdZd fddZdd
dZdddZedddZedddZ	edddZ
  ZS )V3FaceModelu?   Split-branch causal TCN: (audio, cond) → (T, 52) blendshapes.cfgr	   c                   s
  t     | _| dtjttjd | dtjttjd t	t}t	t}||  j
ks=J d| d| d j
 t j j  j| _t fdd jD | _t fd	d jD | _t j|| _t fd
d jD | _t j|| _d S )Nlipsync_idx)dtypeexpression_idxzchannel split mismatch: z + z != c                   $   g | ]}t  j j| j jqS r   r9   r   r1   r   r:   .0drI   r   r   
<listcomp>       z(V3FaceModel.__init__.<locals>.<listcomp>c                   rM   r   rN   rO   rR   r   r   rS      rT   c                   rM   r   rN   rO   rR   r   r   rS      rT   )r   r   rI   register_buffertorchtensorLIPSYNC_BRANCH_CHANNELSlongEXPRESSION_BRANCH_CHANNELSlen
output_dimr   r   	audio_dimr   r   
input_proj
ModuleListshared_dilationsshared_blocksbranch_dilationslipsync_blockslipsync_headexpression_blocksexpression_head)r   rI   	n_lipsyncn_expressionr   rR   r   r   |   s2   



zV3FaceModel.__init__audior   r   r   c                 C  s   t j||gdd}| |}| jD ]}|||}q|}| jD ]}|||}qt | |}|}| jD ]}|||}q3t | |}|j	d |j	d }	}
t j
|	|
| jj|j|jd}||d| jf< ||d| jf< |S )z
        Args:
            audio: (B, T, audio_dim)
            cond:  (B, T, cond_dim)
        Returns:
            blendshapes: (B, T, 52) in [0, 1]
        r    r!   r   r   )devicerK   .)rV   catr^   ra   rc   sigmoidrd   re   rf   shapezerosrI   r\   rj   rK   rJ   rL   )r   ri   r   r   blocklxlipsync_outexexpression_outBToutr   r   r   r'      s&   



zV3FaceModel.forwardr   c                 C  s   d}| j  D ]}d|_|| 7 }q| jD ]}| D ]}d|_|| 7 }qq| jD ]}| D ]}d|_|| 7 }q2q,| j D ]}d|_|| 7 }qD|S )u  Freeze shared backbone + lipsync branch + lipsync head.

        Sets `requires_grad=False` on every parameter that contributes to
        the lipsync output path. After this, lipsync output is bit-for-bit
        deterministic from audio + cond — the expression branch can be
        retrained without ANY drift in lipsync.

        Returns the number of frozen parameters.
        r   F)r^   
parametersrequires_gradnumelra   rc   rd   )r   frozenpro   r   r   r   freeze_lipsync   s$   


zV3FaceModel.freeze_lipsyncc                 C     t dd |  D S )Nc                 s  s    | ]}|  V  qd S r<   )ry   rP   r{   r   r   r   	<genexpr>   s    z'V3FaceModel.n_params.<locals>.<genexpr>sumrw   r   r   r   r   n_params      zV3FaceModel.n_paramsc                 C  r}   )Nc                 s  s    | ]
}|j r| V  qd S r<   )rx   ry   r~   r   r   r   r      s    z*V3FaceModel.n_trainable.<locals>.<genexpr>r   r   r   r   r   n_trainable   r   zV3FaceModel.n_trainabler;   c                 C  s   | j d d S )zFloat32 disk size in MB.   i   )r   r   r   r   r   size_mb   s   zV3FaceModel.size_mb)rI   r	   )ri   r   r   r   r   r   )r   r   )r   r;   )r*   r+   r,   r-   r   r'   r|   propertyr   r   r   r.   r   r   r   r   rH   y   s    
$
"rH   )r-   
__future__r   rV   r   torch.nnr   r7   scripts.compiler.constantsr   r   r   configr	   Moduler
   r/   r9   sortedsetrX   rZ   rH   r   r   r   r   <module>   s    !%