from itertools import islice

from datasets import load_dataset
from tokenizers import Regex, Tokenizer, normalizers, ByteLevelBPETokenizer

model_dir = "./"

# Load the Bengali split of mC4 in streaming mode so the corpus is read
# lazily instead of being downloaded and held in memory all at once.
dataset = load_dataset("mc4", "bn", split="train", streaming=True)

# Instantiate a byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()
# Normalization applied to the raw text before pre-tokenization.
# ByteLevelBPETokenizer is a thin wrapper around a Rust Tokenizer, so the
# normalizer is attached to the underlying `_tokenizer`; assigning to the
# wrapper's `normalizer` attribute may be silently ignored.
tokenizer._tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Nmt(),
        normalizers.NFKC(),
        # Collapse runs of spaces into a single space.
        normalizers.Replace(Regex(" {2,}"), " "),
        # Map stray Bengali-block code points to the canonical danda (U+0964)
        # and double danda (U+0965) used as sentence terminators.
        normalizers.Replace("\u09e4", "\u0964"),
        normalizers.Replace("\u09e5", "\u0965"),
        # The ASCII pipe and the danda look-alike U+09F7 are often typed in
        # place of the danda; normalize them as well.
        normalizers.Replace("\u007c", "\u0964"),
        normalizers.Replace("\u09f7", "\u0964"),
        # A colon immediately after a Bengali character is treated as a
        # mistyped visarga (U+0983).
        normalizers.Replace(Regex(r"(?<=[\u0980-\u09ff]):"), "\u0983"),
        normalizers.Lowercase(),
    ]
)
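# Quick, optional sanity check of the normalizer (illustrative only; the
# sample sentence below is an arbitrary example, not part of the pipeline).
# The doubled space should collapse and the trailing pipe should become a danda.
print(tokenizer._tokenizer.normalizer.normalize_str("এটা  একটা বাক্য|"))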
def batch_iterator(batch_size=1000):
    # A streaming dataset has no len() and cannot be sliced, so batch it
    # lazily while iterating over the examples.
    texts = (example["text"] for example in dataset)
    while batch := list(islice(texts, batch_size)):
        yield batch
# Train the byte-level BPE model on the streamed batches of text.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=["<|endoftext|>"],
)
# Persist the trained tokenizer (vocab, merges, and normalizer) to a single
# tokenizer.json file in model_dir.
tokenizer.save(f"{model_dir}/tokenizer.json")
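# Optional round-trip check (a sketch; assumes tokenizer.json was written to
# model_dir above). Reload the saved file and encode a sample sentence.
reloaded = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
encoding = reloaded.encode("আমি বাংলায় গান গাই।")
print(encoding.tokens)
print(reloaded.decode(encoding.ids))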