| import re
|
| from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
|
| from tokenizers.normalizers import NFKC
|
| from transformers import PreTrainedTokenizerFast
|
|
|
class ThaiTokenizerHF:
    """BPE tokenizer wrapper built on the Hugging Face ``tokenizers`` library.

    The wrapper owns a single ``tokenizers.Tokenizer`` in ``self.tokenizer``
    (``None`` until :meth:`train` or :meth:`load` is called) and exposes
    helpers to train, persist, load, encode and decode with it.
    """

    # Special tokens registered with the trainer and declared to the
    # ``PreTrainedTokenizerFast`` export.
    SPECIAL_TOKENS = ("[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]")

    def __init__(self, vocab_size: int = 50000):
        """Store the target vocabulary size; no tokenizer is built yet.

        Args:
            vocab_size: Vocabulary size passed to the BPE trainer.
        """
        self.vocab_size = vocab_size
        self.tokenizer = None

    def _create_tokenizer(self):
        """Build an untrained BPE tokenizer with its processing pipeline.

        Returns:
            A ``tokenizers.Tokenizer`` with NFKC normalization, a newline
            split followed by Metaspace pre-tokenization, and a Metaspace
            decoder.
        """
        # Declare the unknown token on the model so it matches the "[UNK]"
        # token that is trained and announced to PreTrainedTokenizerFast.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # NFKC only.  StripAccents is deliberately NOT used: it removes
        # Unicode combining marks, and Thai above/below vowels and tone marks
        # are combining marks, so applying it would delete them from the text.
        tokenizer.normalizer = NFKC()

        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            # Keep newlines as their own pieces before Metaspace splitting.
            pre_tokenizers.Split(pattern=r'\n', behavior='isolated'),
            pre_tokenizers.Metaspace(replacement='▁'),
        ])

        tokenizer.decoder = decoders.Metaspace(
            replacement='▁', add_prefix_space=False
        )
        return tokenizer

    def train(self, texts, save_path=None):
        """Train the BPE model on ``texts`` and optionally persist it.

        Args:
            texts: Iterable of training strings, forwarded to
                ``Tokenizer.train_from_iterator``.
            save_path: When truthy, the tokenizer JSON is written to this
                path and a ``PreTrainedTokenizerFast`` export is written to
                ``save_path + "_hf"``.  When falsy, nothing is saved.

        Returns:
            The trained ``tokenizers.Tokenizer`` (also kept on ``self``).
        """
        # Imported locally so this method does not depend on import order
        # elsewhere in the module.
        from tokenizers import trainers

        if self.tokenizer is None:
            self.tokenizer = self._create_tokenizer()

        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            special_tokens=list(self.SPECIAL_TOKENS),
            show_progress=True,
        )
        self.tokenizer.train_from_iterator(texts, trainer=trainer)

        # Both artifacts are written only when a path was given; building
        # ``save_path + "_hf"`` with the default ``None`` would raise.
        if save_path:
            self.tokenizer.save(save_path)

            hf_tokenizer = PreTrainedTokenizerFast(
                tokenizer_object=self.tokenizer,
                unk_token="[UNK]",
                pad_token="[PAD]",
                cls_token="[CLS]",
                sep_token="[SEP]",
                mask_token="[MASK]",
            )
            hf_tokenizer.save_pretrained(save_path + "_hf")

        return self.tokenizer

    def load(self, path):
        """Load a tokenizer JSON file and make it the active tokenizer.

        Args:
            path: Path passed to ``Tokenizer.from_file``.

        Returns:
            The loaded ``tokenizers.Tokenizer``.
        """
        self.tokenizer = Tokenizer.from_file(path)
        return self.tokenizer

    def get_vocab_size(self) -> int:
        """Return the active tokenizer's vocabulary size, or 0 if none is set."""
        if self.tokenizer is None:
            return 0
        return self.tokenizer.get_vocab_size()

    def encode(self, text, add_special_tokens: bool = False):
        """Encode ``text`` with the active tokenizer.

        Args:
            text: String to encode.
            add_special_tokens: When true and both ``[CLS]`` and ``[SEP]``
                exist in the vocabulary, the result is wrapped as
                ``[CLS] ... [SEP]``.  If either token is missing the plain
                encoding is returned.

        Returns:
            A ``tokenizers.Encoding``.

        Raises:
            AttributeError: If no tokenizer has been trained or loaded
                (``self.tokenizer`` is ``None``).
        """
        encoding = self.tokenizer.encode(text)
        if not add_special_tokens:
            return encoding

        cls_id = self.tokenizer.token_to_id("[CLS]")
        sep_id = self.tokenizer.token_to_id("[SEP]")
        if cls_id is None or sep_id is None:
            return encoding

        # ``Encoding.ids`` is read-only, so the special tokens are added by
        # running a template post-processor over the encoding.  This leaves
        # the tokenizer's own configuration untouched.
        template = processors.TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
        )
        return template.process(encoding, None, True)

    def decode(self, ids, skip_special_tokens=True):
        """Decode token ids back to text with the active tokenizer.

        Args:
            ids: Sequence of token ids.
            skip_special_tokens: Forwarded to ``Tokenizer.decode``.

        Returns:
            The decoded string.

        Raises:
            AttributeError: If no tokenizer has been trained or loaded
                (``self.tokenizer`` is ``None``).
        """
        return self.tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens
        )
|
|
|
|
|
| from tokenizers import trainers |