o
    ig                     @  s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z d dlmZ d dlm
Z d dlmZ d d	lmZ d
Zg dZdZG dd dZdS )    )annotationsN)Path)List)	Tokenizer)	ByteLevel)BPE)TemplateProcessing)
BpeTrainerzklue/roberta-base)[PAD][UNK][CLS][SEP]i>  c                   @  st   e Zd Zd+dd	Zd,ddZd-ddZd.ddZe		d/d0d d!Zed1d2d"d#Z	ed3d%d&Z
ed4d(d)Zd*S )5MicroTokenizerpad_idintcls_idsep_id
vocab_sizemodestrc                 C  s(   || _ || _|| _|| _|| _|| _d S Nbackendr   r   r   r   r   )selfr   r   r   r   r   r    r   T/dataset/kemix-engine/package/face/animasync-face-v3/models/microalbert/tokenizer.py__init__   s   	
zMicroTokenizer.__init__returnc                 C  s   | j S r   )r   )r   r   r   r   __len__$   s   zMicroTokenizer.__len__textmax_len	List[int]c                 C  sH   | j dkr| j|dd|dd}t|d S | j|}t|jd | S )NhfTFadd_special_tokens
truncation
max_lengthreturn_attention_mask	input_ids)r   r   listencodeids)r   r   r    outencr   r   r   r*   '   s   
zMicroTokenizer.encodetexts	List[str]List[List[int]]c                   sN   | j dkr| j|dd dd}dd |d D S | j|} fdd|D S )	Nr"   TFr#   c                 S  s   g | ]}t |qS r   )r)   ).0r+   r   r   r   
<listcomp>=   s    z/MicroTokenizer.batch_encode.<locals>.<listcomp>r(   c                   s   g | ]}t |jd   qS r   )r)   r+   )r1   er    r   r   r2   ?   s    )r   r   encode_batch)r   r.   r    r,   	encodingsr   r4   r   batch_encode4   s   
zMicroTokenizer.batch_encodeTFsave_dirr   train_jsonl	prefer_hfbooladd_speaker_tokens'MicroTokenizer'c              
   C  s   t |}|jddd |d }| r5|s5z| |W S  ty4 } ztd|  W Y d }~nd }~ww |rZz| j||dW S  tyY } ztd| d W Y d }~nd }~ww | |||S )NT)parentsexist_oktokenizer.jsonz.[tokenizer] local tokenizer.json load failed: )r<   z[tokenizer] HF path failed: z; training fresh BPE on seed)r   mkdirexists	_load_bpe	Exceptionprint_load_hf
_train_bpe)clsr8   r9   r    r:   r<   existingr3   r   r   r   buildA   s$   	zMicroTokenizer.buildc                 C  s   ddl m} |t}|r|dddgi |ddd|dd	}t|d
 dks+J d|jd ur5|jd us7J | |t|jt|jt|j	t|ddS )Nr   )AutoTokenizeradditional_special_tokensz[SELF]z[OTHER]u   안녕하세요TFr#   r(      zHF tokenizer smoke test failedr"   r   )
transformersrK   from_pretrained	_HF_MODELr$   lenpad_token_idcls_token_idr   sep_token_id)rH   r    r<   rK   tokprober   r   r   rF   Z   s*   
zMicroTokenizer._load_hf
jsonl_pathc                 C  s$  g }t |jdd&}|D ]}t|}|d D ]}|d  }	|	r'||	 qqW d    n1 s3w   Y  ttdd}
tdd|
_	t
 |
_tttt d	}|
j||d
 |
d}|
d}|
d}tdd|fd|fgd|
_|
j|d |d }|
t| | |
||||
 ddS )Nzutf-8)encodingturnsr   r   )	unk_tokenF)add_prefix_space)r   special_tokensinitial_alphabet)trainerr   r   r
   z[CLS] $A [SEP])singler\   )r&   r@   bper   )r   openjsonloadsstripappendr   r   ByteLevelPreTokpre_tokenizerByteLevelDecoderdecoderr	   _BPE_VOCAB_SIZE_SPECIALalphabettrain_from_iteratortoken_to_idr   post_processorenable_truncationsaver   get_vocab_size)rH   rW   r8   r    r.   flinerowttxtrU   r^   r   r   r   out_pathr   r   r   rG   u   sN   




zMicroTokenizer._train_bpepathc                 C  s8   t t|}| ||d|d|d| ddS )Nr
   r   r   r`   r   )r   	from_filer   rn   rr   )rH   ry   rU   r   r   r   rC      s   zMicroTokenizer._load_bpeN)
r   r   r   r   r   r   r   r   r   r   )r   r   )r   r   r    r   r   r!   )r.   r/   r    r   r   r0   )TF)r8   r   r9   r   r    r   r:   r;   r<   r;   r   r=   )F)r    r   r<   r;   r   r=   )rW   r   r8   r   r    r   r   r=   )ry   r   r   r=   )__name__
__module____qualname__r   r   r*   r7   classmethodrJ   rF   rG   rC   r   r   r   r   r      s    



%r   )
__future__r   rb   pathlibr   typingr   
tokenizersr   tokenizers.decodersr   rh   tokenizers.modelsr   tokenizers.pre_tokenizersrf   tokenizers.processorsr   tokenizers.trainersr	   rP   rk   rj   r   r   r   r   r   <module>   s    