# AiDaeng-Thai-RoPE / hf_tokenizer.py
# Author: JonusNattapong
# Commit 0bce4e1 (verified): Add missing source files for custom model loading
import re
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast
class ThaiTokenizerHF:
    """Thin wrapper around a Hugging Face ``tokenizers`` BPE tokenizer.

    The wrapper builds a BPE tokenizer with a Metaspace pre-tokenizer, trains
    it from an iterator of texts, optionally saves it (both as a raw
    ``tokenizers`` JSON file and as a ``PreTrainedTokenizerFast`` directory),
    and exposes small ``encode`` / ``decode`` helpers.

    Attributes:
        vocab_size: Target vocabulary size handed to the BPE trainer.
        tokenizer: The underlying ``tokenizers.Tokenizer``; ``None`` until
            ``train`` or ``load`` has been called.
    """

    # Special tokens registered with the BPE trainer, in this order.
    SPECIAL_TOKENS = ("[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]")

    def __init__(self, vocab_size=50000):
        """Store the target vocabulary size; no tokenizer is built yet."""
        self.vocab_size = vocab_size
        self.tokenizer = None

    def _create_tokenizer(self):
        """Build an untrained BPE tokenizer with its normalizer, pre-tokenizer and decoder.

        Returns:
            A new, untrained ``tokenizers.Tokenizer``.
        """
        # Kept as function-scope imports, as in the original code.
        from tokenizers.normalizers import Sequence as NormalizerSequence
        from tokenizers.normalizers import StripAccents

        tokenizer = Tokenizer(models.BPE())

        # Newlines are isolated as their own pieces before Metaspace marks
        # word boundaries with the replacement character.
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.Split(pattern=r'\n', behavior='isolated'),
            pre_tokenizers.Metaspace(replacement='▁'),
        ])

        # Unicode normalization for multilingual support.
        # NOTE(review): StripAccents removes Unicode combining marks; Thai
        # tone marks and above/below vowels appear to fall in that category,
        # so this step may drop them from Thai text -- confirm this is
        # intended before training a new vocabulary.
        tokenizer.normalizer = NormalizerSequence([NFKC(), StripAccents()])

        # NOTE(review): newer `tokenizers` releases may expect
        # `prepend_scheme` instead of `add_prefix_space` -- confirm against
        # the pinned library version.
        tokenizer.decoder = decoders.Metaspace(replacement='▁', add_prefix_space=False)
        return tokenizer

    def train(self, texts, save_path=None):
        """Train the BPE model on ``texts`` and optionally save it.

        Args:
            texts: Iterator/iterable of training strings, passed to
                ``Tokenizer.train_from_iterator``.
            save_path: If truthy, the tokenizer JSON is written to this path
                and a ``PreTrainedTokenizerFast`` export is written to
                ``save_path + "_hf"``. If falsy, nothing is written.

        Returns:
            The trained ``tokenizers.Tokenizer``.
        """
        if self.tokenizer is None:
            self.tokenizer = self._create_tokenizer()

        # `trainers` is a module-level name of this file, resolved at call time.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            special_tokens=list(self.SPECIAL_TOKENS),
            show_progress=True,
        )
        self.tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            # Coerce once so path-like objects work for both the save call
            # and the string concatenation below.
            target = str(save_path)
            self.tokenizer.save(target)

            # Create HF compatible tokenizer. This stays inside the
            # `save_path` guard so that `save_path=None` never reaches the
            # `+ "_hf"` concatenation.
            hf_tokenizer = PreTrainedTokenizerFast(
                tokenizer_object=self.tokenizer,
                unk_token="[UNK]",
                pad_token="[PAD]",
                cls_token="[CLS]",
                sep_token="[SEP]",
                mask_token="[MASK]",
            )
            hf_tokenizer.save_pretrained(target + "_hf")

        return self.tokenizer

    def load(self, path):
        """Load a tokenizer JSON file from ``path`` and return the tokenizer."""
        self.tokenizer = Tokenizer.from_file(path)
        return self.tokenizer

    def get_vocab_size(self):
        """Return the tokenizer's vocabulary size, or 0 if none is loaded."""
        return self.tokenizer.get_vocab_size() if self.tokenizer else 0

    def _ensure_special_token_template(self):
        """Install a ``[CLS] ... [SEP]`` post-processor if none is present.

        Does nothing when the tokenizer already has a post-processor or when
        either ``[CLS]`` or ``[SEP]`` is missing from the vocabulary.
        """
        if self.tokenizer.post_processor is not None:
            return
        cls_id = self.tokenizer.token_to_id("[CLS]")
        sep_id = self.tokenizer.token_to_id("[SEP]")
        if cls_id is None or sep_id is None:
            return
        self.tokenizer.post_processor = processors.TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
        )

    def encode(self, text, add_special_tokens=False):
        """Encode ``text`` and return the resulting ``Encoding``.

        Args:
            text: Input string.
            add_special_tokens: When true, the encoding is wrapped in
                ``[CLS]`` / ``[SEP]`` (provided both tokens exist in the
                vocabulary). When false, no special tokens are added.

        Returns:
            The ``Encoding`` produced by the underlying tokenizer.

        Note:
            The first call with ``add_special_tokens=True`` may attach a
            template post-processor to ``self.tokenizer``; it is then also
            included if the tokenizer is saved afterwards.
        """
        if add_special_tokens:
            # `Encoding.ids` cannot be assigned to, so the special tokens are
            # added by the library's post-processor rather than by patching
            # the id list after the fact.
            self._ensure_special_token_template()
        return self.tokenizer.encode(text, add_special_tokens=add_special_tokens)

    def decode(self, ids, skip_special_tokens=True):
        """Decode a sequence of token ids back into a string."""
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
# Import here to avoid circular import
from tokenizers import trainers