import re
from typing import Iterable, List, Optional

from tokenizers import (
    Tokenizer,
    decoders,
    models,
    pre_tokenizers,
    processors,
    trainers,
)
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast


class ThaiTokenizerHF:
    """BPE tokenizer wrapper built on Hugging Face ``tokenizers``.

    The wrapper owns a single ``tokenizers.Tokenizer`` instance
    (``self.tokenizer``) which is ``None`` until :meth:`train` or
    :meth:`load` has been called.
    """

    # Special tokens registered with the BPE trainer, in vocabulary order.
    _SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    def __init__(self, vocab_size: int = 50000) -> None:
        """Store the target vocabulary size; no tokenizer is built yet."""
        self.vocab_size = vocab_size
        self.tokenizer = None

    def _create_tokenizer(self):
        """Build an untrained BPE tokenizer with its normalization pipeline.

        Returns:
            A fresh ``tokenizers.Tokenizer`` using NFKC normalization, a
            newline-isolating + Metaspace pre-tokenizer and a Metaspace
            decoder.
        """
        tokenizer = Tokenizer(models.BPE())

        # Only NFKC is applied. ``StripAccents`` must NOT be used here: it
        # removes every Unicode combining mark, and Thai above/below vowels
        # and tone marks are combining marks, so it would corrupt Thai text.
        # NOTE(review): NFKC itself decomposes THAI SARA AM (U+0E33) into
        # U+0E4D U+0E32 -- confirm this is acceptable or switch to NFC.
        tokenizer.normalizer = NFKC()

        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            # A plain ``str`` pattern is matched literally, so the pattern
            # has to be a real newline character (not the raw string r'\n',
            # which is a backslash followed by "n").
            pre_tokenizers.Split(pattern="\n", behavior="isolated"),
            pre_tokenizers.Metaspace(replacement="▁"),
        ])

        # NOTE(review): ``add_prefix_space`` is kept exactly as in the
        # original code; newer ``tokenizers`` releases express this via
        # ``prepend_scheme`` -- confirm against the pinned version.
        tokenizer.decoder = decoders.Metaspace(
            replacement="▁",
            add_prefix_space=False,
        )
        return tokenizer

    def train(self, texts: Iterable[str], save_path=None):
        """Train the BPE model and optionally persist it.

        Args:
            texts: Iterable of training strings, passed to
                ``Tokenizer.train_from_iterator``.
            save_path: Optional file path (``str`` or ``os.PathLike``). When
                given, the raw tokenizer JSON is written there and a
                ``PreTrainedTokenizerFast`` export is written to the
                directory ``<save_path>_hf``.

        Returns:
            The trained ``tokenizers.Tokenizer``.
        """
        if self.tokenizer is None:
            self.tokenizer = self._create_tokenizer()

        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            special_tokens=list(self._SPECIAL_TOKENS),
            show_progress=True,
        )
        self.tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            import os

            # Accept pathlib.Path as well as str; ``Tokenizer.save`` and the
            # "_hf" suffix concatenation both need a plain string.
            base_path = os.fspath(save_path)
            self.tokenizer.save(base_path)

            # Create HF compatible tokenizer
            hf_tokenizer = PreTrainedTokenizerFast(
                tokenizer_object=self.tokenizer,
                unk_token="[UNK]",
                pad_token="[PAD]",
                cls_token="[CLS]",
                sep_token="[SEP]",
                mask_token="[MASK]",
            )
            hf_tokenizer.save_pretrained(base_path + "_hf")

        return self.tokenizer

    def load(self, path: str):
        """Load a tokenizer from a JSON file and return it."""
        self.tokenizer = Tokenizer.from_file(path)
        return self.tokenizer

    def get_vocab_size(self) -> int:
        """Return the vocabulary size, or 0 when no tokenizer is set."""
        return self.tokenizer.get_vocab_size() if self.tokenizer else 0

    def encode(self, text: str, add_special_tokens: bool = False):
        """Encode ``text`` into a ``tokenizers.Encoding``.

        Args:
            text: The string to encode.
            add_special_tokens: When true, wrap the sequence as
                ``[CLS] ... [SEP]``. If the underlying tokenizer already has
                a post-processor it is used as-is; otherwise a CLS/SEP
                template is applied, provided both tokens exist in the
                vocabulary (if not, the plain encoding is returned).

        Returns:
            The resulting ``Encoding``.

        Raises:
            AttributeError: If no tokenizer has been trained or loaded.
        """
        if not add_special_tokens:
            # Pass the flag explicitly: ``Tokenizer.encode`` defaults to
            # adding special tokens when a post-processor is configured.
            return self.tokenizer.encode(text, add_special_tokens=False)

        if self.tokenizer.post_processor is not None:
            return self.tokenizer.encode(text, add_special_tokens=True)

        encoding = self.tokenizer.encode(text, add_special_tokens=False)
        cls_id = self.tokenizer.token_to_id("[CLS]")
        sep_id = self.tokenizer.token_to_id("[SEP]")
        if cls_id is None or sep_id is None:
            return encoding

        # ``Encoding.ids`` is read-only, so the special tokens cannot be
        # spliced in by assignment; a post-processor builds a new Encoding
        # with ids, tokens, offsets and masks kept consistent.
        template = processors.TemplateProcessing(
            single="[CLS] $A [SEP]",
            special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
        )
        return template.process(encoding, pair=None, add_special_tokens=True)

    def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
        """Decode token ids back into text.

        Args:
            ids: Token ids to decode.
            skip_special_tokens: Forwarded to ``Tokenizer.decode``.

        Returns:
            The decoded string.

        Raises:
            AttributeError: If no tokenizer has been trained or loaded.
        """
        return self.tokenizer.decode(
            ids,
            skip_special_tokens=skip_special_tokens,
        )