Spaces:

tventurella
/

mr_chatterbox

Running

App Files Files Community

tventurella committed on Mar 24

Commit

59856b4

verified ·

1 Parent(s): 5a27571

Upload 17 files

Browse files

Files changed (17) hide show

Dockerfile +40 -0
README.md +5 -9
nanochat/__init__.py +0 -0
nanochat/checkpoint_manager.py +196 -0
nanochat/common.py +278 -0
nanochat/engine.py +360 -0
nanochat/flash_attention.py +187 -0
nanochat/gpt.py +465 -0
nanochat/logo.svg +8 -0
nanochat/optim.py +533 -0
nanochat/tokenizer.py +14 -0
nanochat/ui.html +566 -0
scripts/__init__.py +0 -0
scripts/chat_web.py +421 -0
start.sh +27 -0
tokenizer.json +0 -0
tokenizer_wrapper.py +282 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,40 @@

+# Base image: slim Debian variant of CPython 3.11.
+FROM python:3.11-slim
+WORKDIR /app
+# Install system dependencies (compiler toolchain). The apt package lists are
+# deleted in the same layer so they are not baked into the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies.
+# torch is installed in its own step so it can be pulled from the CPU-only wheel index.
+RUN pip install --no-cache-dir \
+    torch --index-url https://download.pytorch.org/whl/cpu
+# The remaining packages come from the default package index.
+RUN pip install --no-cache-dir \
+    tokenizers \
+    fastapi \
+    uvicorn[standard] \
+    pydantic \
+    httpx \
+    filelock \
+    huggingface_hub
+# Copy application code (paths are relative to WORKDIR /app)
+COPY nanochat/ nanochat/
+COPY scripts/ scripts/
+COPY tokenizer_wrapper.py .
+COPY tokenizer.json .
+COPY start.sh .
+RUN chmod +x start.sh
+# Create model directory; it sits under NANOCHAT_BASE_DIR (set below) in the
+# chatsft_checkpoints/<model tag> layout.
+RUN mkdir -p /app/nanochat_cache/chatsft_checkpoints/d18
+# Set environment variables
+# NANOCHAT_BASE_DIR: base directory for nanochat data (replaces the ~/.cache default).
+ENV NANOCHAT_BASE_DIR=/app/nanochat_cache
+# PYTHONPATH: makes the copied nanochat/ and scripts/ packages importable from /app.
+ENV PYTHONPATH=/app
+# HuggingFace Spaces expects port 7860
+EXPOSE 7860
+# Container entry point.
+CMD ["./start.sh"]

README.md CHANGED Viewed

@@ -1,12 +1,8 @@
 ---
-title: Mr Chatterbox
-emoji: ⚡
-colorFrom: purple
-colorTo: indigo
 sdk: docker
-pinned: false
-license: mit
-short_description: The Victorian Gentleman Chatbot
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Mr. Chatterbox
+emoji: 🎩
+colorFrom: red
+colorTo: yellow
 sdk: docker
+app_port: 7860
 ---

nanochat/__init__.py ADDED Viewed

File without changes

nanochat/checkpoint_manager.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+Utilities for saving and loading model/optim/state checkpoints.
+"""
+import os
+import re
+import glob
+import json
+import logging
+import torch
+from nanochat.common import get_base_dir
+from nanochat.gpt import GPT, GPTConfig
+from nanochat.tokenizer import get_tokenizer
+from nanochat.common import setup_default_logging
+# Set up logging
+setup_default_logging()
+logger = logging.getLogger(__name__)
+def log0(message):
+    """Log `message` at INFO level, but only when the RANK env var is 0 or unset."""
+    rank = int(os.environ.get('RANK', 0))
+    if rank != 0:
+        return
+    logger.info(message)
+def _patch_missing_config_keys(model_config_kwargs):
+    """Fill in, in place, config keys that older checkpoints did not record."""
+    if "window_pattern" in model_config_kwargs:
+        return
+    # Older models were trained with full context (no sliding window), hence "L".
+    model_config_kwargs["window_pattern"] = "L"
+    log0("Patching missing window_pattern in model config to 'L'")
+def _patch_missing_keys(model_data, model_config):
+    """Insert, in place, default tensors for parameters absent from older checkpoints.
+    Each default is a 1-D tensor with one entry per layer (model_config.n_layer).
+    """
+    depth = model_config.n_layer
+    # (state-dict key, tensor factory, value as it appears in the log line)
+    defaults = (
+        ("resid_lambdas", torch.ones, "1.0"),  # identity scaling
+        ("x0_lambdas", torch.zeros, "0.0"),    # disabled
+    )
+    for key, make_default, shown in defaults:
+        if key in model_data:
+            continue
+        model_data[key] = make_default(depth)
+        log0(f"Patching missing {key} in model data to {shown}")
+def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
+    """Write a checkpoint for `step` into `checkpoint_dir`.
+    Rank 0 writes model_<step>.pt and meta_<step>.json. Any rank that passes a
+    non-None `optimizer_data` writes its own optim_<step>_rank<rank>.pt file.
+    """
+    if rank == 0:
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        # Model parameters
+        weights_file = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
+        torch.save(model_data, weights_file)
+        logger.info(f"Saved model parameters to: {weights_file}")
+        # Metadata, as JSON
+        meta_file = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
+        with open(meta_file, "w", encoding="utf-8") as handle:
+            json.dump(meta_data, handle, indent=2)
+        logger.info(f"Saved metadata to: {meta_file}")
+    if optimizer_data is None:
+        return
+    # Optimizer state is sharded across ranks, so every rank saves its own shard.
+    os.makedirs(checkpoint_dir, exist_ok=True)
+    shard_file = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
+    torch.save(optimizer_data, shard_file)
+    logger.info(f"Saved optimizer state to: {shard_file}")
+def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
+    """Read the checkpoint for `step` from `checkpoint_dir`.
+    Returns (model_data, optimizer_data, meta_data). `optimizer_data` is None
+    unless `load_optimizer` is true, in which case the shard for `rank` is read.
+    Tensors are mapped onto `device`.
+    """
+    def in_dir(name):
+        return os.path.join(checkpoint_dir, name)
+    # Model state
+    model_data = torch.load(in_dir(f"model_{step:06d}.pt"), map_location=device)
+    # Optimizer shard, only when asked for
+    optimizer_data = (
+        torch.load(in_dir(f"optim_{step:06d}_rank{rank:d}.pt"), map_location=device)
+        if load_optimizer
+        else None
+    )
+    # Metadata
+    with open(in_dir(f"meta_{step:06d}.json"), "r", encoding="utf-8") as handle:
+        meta_data = json.load(handle)
+    return model_data, optimizer_data, meta_data
+def build_model(checkpoint_dir, step, device, phase):
+    """
+    A bunch of repetitive code to build a model from a given checkpoint.
+    Args:
+    - checkpoint_dir: directory holding model_<step>.pt and meta_<step>.json
+    - step: checkpoint step to load
+    - device: target device; must expose `.type` (i.e. a torch.device, not a string)
+    - phase: "train" or "eval"; selects model.train() vs model.eval()
+    Returns:
+    - base model - uncompiled, not wrapped in DDP
+    - tokenizer
+    - meta data saved during base model training
+    """
+    assert phase in ["train", "eval"], f"Invalid phase: {phase}"
+    # Optimizer state is never needed here, so it is not loaded.
+    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
+    if device.type in {"cpu", "mps"}:
+        # Convert bfloat16 tensors to float for CPU inference
+        model_data = {
+            k: v.float() if v.dtype == torch.bfloat16 else v
+            for k, v in model_data.items()
+        }
+    # Hack: fix torch compile issue, which prepends all keys with _orig_mod.
+    model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
+    # The config dict is patched in place, so meta_data["model_config"] (returned below) is patched too.
+    model_config_kwargs = meta_data["model_config"]
+    _patch_missing_config_keys(model_config_kwargs)
+    log0(f"Building model with config: {model_config_kwargs}")
+    model_config = GPTConfig(**model_config_kwargs)
+    _patch_missing_keys(model_data, model_config)
+    # Construct on the meta device first so no real parameter storage is allocated yet.
+    with torch.device("meta"):
+        model = GPT(model_config)
+    # Load the model state
+    model.to_empty(device=device)
+    model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init
+    # strict=False: unexpected keys are only logged below; missing keys are not reported here.
+    result = model.load_state_dict(model_data, strict=False, assign=True)
+    if result.unexpected_keys:
+        log0(f"Ignoring unexpected checkpoint keys: {result.unexpected_keys}")
+    # Put the model in the right training phase / mode
+    if phase == "eval":
+        model.eval()
+    else:
+        model.train()
+    # Load the Tokenizer
+    tokenizer = get_tokenizer()
+    # Sanity check: compatibility between model and tokenizer
+    assert tokenizer.get_vocab_size() == model_config_kwargs["vocab_size"], f"Tokenizer vocab size {tokenizer.get_vocab_size()} does not match model config vocab size {model_config_kwargs['vocab_size']}"
+    return model, tokenizer, meta_data
+def find_largest_model(checkpoints_dir):
+    """Guess a model tag: pick the "biggest" sub-directory of `checkpoints_dir`.
+    Tags starting with d<number> are preferred and the largest number wins.
+    If no tag has that form, the most recently modified directory is returned.
+    Raises FileNotFoundError when there are no sub-directories at all.
+    """
+    tags = [
+        entry
+        for entry in os.listdir(checkpoints_dir)
+        if os.path.isdir(os.path.join(checkpoints_dir, entry))
+    ]
+    if not tags:
+        raise FileNotFoundError(f"No checkpoints found in {checkpoints_dir}")
+    # 1) tags of the form d<number>: collect (depth, tag) pairs
+    by_depth = [
+        (int(found.group(1)), tag)
+        for tag in tags
+        if (found := re.match(r"d(\d+)", tag))
+    ]
+    if by_depth:
+        return max(by_depth, key=lambda pair: pair[0])[1]
+    # 2) otherwise fall back to the most recently updated directory
+    return max(tags, key=lambda tag: os.path.getmtime(os.path.join(checkpoints_dir, tag)))
+def find_last_step(checkpoint_dir):
+    """Return the highest step among the model_<step>.pt files in `checkpoint_dir`.
+    Raises FileNotFoundError if the directory contains no such file.
+    """
+    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt"))
+    if not checkpoint_files:
+        raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
+    # Compare steps as integers, not strings: a string max is lexicographic and
+    # would rank "999999" above "1000000" once steps outgrow the 6-digit padding.
+    return max(
+        int(os.path.basename(path).split("_")[-1].split(".")[0])
+        for path in checkpoint_files
+    )
+# -----------------------------------------------------------------------------
+# convenience functions that take into account nanochat's directory structure
+def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None):
+    """Build a model from `checkpoints_dir`/<model_tag> at `step`.
+    A missing `model_tag` defaults to the largest model found; a missing `step`
+    defaults to the last checkpoint step. Returns (model, tokenizer, meta_data).
+    """
+    if model_tag is None:
+        # no tag given: default to the largest model
+        model_tag = find_largest_model(checkpoints_dir)
+        log0(f"No model tag provided, guessing model tag: {model_tag}")
+    run_dir = os.path.join(checkpoints_dir, model_tag)
+    # no step given: default to the last one
+    step = find_last_step(run_dir) if step is None else step
+    assert step is not None, f"No checkpoints found in {run_dir}"
+    log0(f"Loading model from {run_dir} with step {step}")
+    return build_model(run_dir, step, device, phase)
+def load_model(source, *args, **kwargs):
+    """Load a model by source name ("base", "sft" or "rl").
+    The source selects a checkpoints sub-directory under the nanochat base dir;
+    remaining arguments are forwarded to load_model_from_dir. An unknown source
+    raises KeyError.
+    """
+    subdir_for_source = {
+        "base": "base_checkpoints",
+        "sft": "chatsft_checkpoints",
+        "rl": "chatrl_checkpoints",
+    }
+    subdir = subdir_for_source[source]
+    checkpoints_root = os.path.join(get_base_dir(), subdir)
+    return load_model_from_dir(checkpoints_root, *args, **kwargs)
+def load_optimizer_state(source, device, rank, model_tag=None, step=None):
+    """Load just the optimizer shard for a given rank, without re-loading the model.
+    `model_tag` and `step` default to the largest model and its last step.
+    Returns the loaded object, or None (after logging) if the shard file is missing.
+    """
+    subdir_for_source = {
+        "base": "base_checkpoints",
+        "sft": "chatsft_checkpoints",
+        "rl": "chatrl_checkpoints",
+    }
+    subdir = subdir_for_source[source]
+    checkpoints_root = os.path.join(get_base_dir(), subdir)
+    tag = find_largest_model(checkpoints_root) if model_tag is None else model_tag
+    run_dir = os.path.join(checkpoints_root, tag)
+    if step is None:
+        step = find_last_step(run_dir)
+    shard_file = os.path.join(run_dir, f"optim_{step:06d}_rank{rank:d}.pt")
+    if os.path.exists(shard_file):
+        log0(f"Loading optimizer state from {shard_file}")
+        return torch.load(shard_file, map_location=device)
+    log0(f"Optimizer checkpoint not found: {shard_file}")
+    return None

nanochat/common.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""
+Common utilities for nanochat.
+"""
+import os
+import re
+import logging
+import urllib.request
+import torch
+import torch.distributed as dist
+from filelock import FileLock
+# Compute dtype (matmuls, activations). Master weights stay fp32 for optimizer precision.
+# Linear layers cast their weights to this dtype in forward, replacing torch.amp.autocast.
+# Set the NANOCHAT_DTYPE env var to override: "bfloat16", "float16", "float32"
+_DTYPE_MAP = {
+    "bfloat16": torch.bfloat16,
+    "float16": torch.float16,
+    "float32": torch.float32,
+}
+def _detect_compute_dtype():
+    """Return (dtype, reason): the compute dtype and a human-readable explanation.
+    NANOCHAT_DTYPE wins when set (an unknown value raises KeyError); otherwise the
+    choice depends on CUDA availability and the device's compute capability.
+    """
+    override = os.environ.get("NANOCHAT_DTYPE")
+    if override is not None:
+        return _DTYPE_MAP[override], f"set via NANOCHAT_DTYPE={override}"
+    if not torch.cuda.is_available():
+        return torch.float32, "auto-detected: no CUDA (CPU/MPS)"
+    # bf16 requires SM 80+ (Ampere: A100, A10, etc.)
+    # Older GPUs like V100 (SM 70) and T4 (SM 75) only have fp16 tensor cores
+    capability = torch.cuda.get_device_capability()
+    sm = f"CUDA SM {capability[0]}{capability[1]}"
+    if capability >= (8, 0):
+        return torch.bfloat16, f"auto-detected: {sm} (bf16 supported)"
+    # fp16 training requires GradScaler (not yet implemented), so fall back to fp32.
+    # fp16 can still be forced with NANOCHAT_DTYPE=float16.
+    return torch.float32, f"auto-detected: {sm} (pre-Ampere, bf16 not supported, using fp32)"
+COMPUTE_DTYPE, COMPUTE_DTYPE_REASON = _detect_compute_dtype()
+class ColoredFormatter(logging.Formatter):
+    """Custom formatter that adds ANSI colors to log messages.
+    The level name is colored per level; in INFO messages, quantities such as
+    "12 MB" / "3.5%" / "40 docs" are bolded and "Shard <n>" is highlighted.
+    """
+    # ANSI color codes
+    COLORS = {
+        'DEBUG': '\033[36m',    # Cyan
+        'INFO': '\033[32m',     # Green
+        'WARNING': '\033[33m',  # Yellow
+        'ERROR': '\033[31m',    # Red
+        'CRITICAL': '\033[35m', # Magenta
+    }
+    RESET = '\033[0m'
+    BOLD = '\033[1m'
+    def format(self, record):
+        """Return the formatted, colorized text for `record`.
+        `record.levelname` is restored before returning, so other handlers or
+        formatters that see the same record do not inherit the escape codes.
+        """
+        levelname = record.levelname
+        color = self.COLORS.get(levelname)
+        if color is not None:
+            # Temporarily swap in the colored level name for the base formatter.
+            record.levelname = f"{color}{self.BOLD}{levelname}{self.RESET}"
+        try:
+            message = super().format(record)
+        finally:
+            record.levelname = levelname
+        # Add color to specific parts of the message
+        if levelname == 'INFO':
+            # Highlight numbers and percentages
+            message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
+            message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
+        return message
+def setup_default_logging():
+    """Call logging.basicConfig with level INFO and one ColoredFormatter stream handler."""
+    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    stream = logging.StreamHandler()
+    stream.setFormatter(ColoredFormatter(log_format))
+    logging.basicConfig(level=logging.INFO, handlers=[stream])
+setup_default_logging()
+logger = logging.getLogger(__name__)
+def get_base_dir():
+    """Return the nanochat base directory, creating it if needed.
+    Uses NANOCHAT_BASE_DIR when it is set and non-empty, else ~/.cache/nanochat
+    (co-located with other cached data).
+    """
+    nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR") or os.path.join(
+        os.path.expanduser("~"), ".cache", "nanochat"
+    )
+    os.makedirs(nanochat_dir, exist_ok=True)
+    return nanochat_dir
+def download_file_with_lock(url, filename, postprocess_fn=None):
+    """
+    Downloads a file from a URL to a local path in the base directory.
+    Uses a lock file to prevent concurrent downloads among multiple ranks.
+    Args:
+    - url: address to fetch
+    - filename: name of the destination file, relative to the base directory
+    - postprocess_fn: optional callable invoked with the final file path after the download
+    Returns the local file path. If the file already exists nothing is downloaded.
+    """
+    base_dir = get_base_dir()
+    file_path = os.path.join(base_dir, filename)
+    lock_path = file_path + ".lock"
+    if os.path.exists(file_path):
+        return file_path
+    with FileLock(lock_path):
+        # Only a single rank can acquire this lock
+        # All other ranks block until it is released
+        # Recheck after acquiring lock
+        if os.path.exists(file_path):
+            return file_path
+        # Download the content as bytes
+        print(f"Downloading {url}...")
+        with urllib.request.urlopen(url) as response:
+            content = response.read() # bytes
+        # Write to a temporary sibling and rename it into place. Existence of
+        # file_path is what marks the download as done, so it must never be
+        # visible in a half-written state (e.g. after a crash or a full disk).
+        tmp_path = file_path + ".tmp"
+        try:
+            with open(tmp_path, 'wb') as f:
+                f.write(content)
+            os.replace(tmp_path, file_path)
+        finally:
+            # Only present here if the write or rename failed.
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+        print(f"Downloaded to {file_path}")
+        # Run the postprocess function if provided
+        if postprocess_fn is not None:
+            postprocess_fn(file_path)
+    return file_path
+def print0(s="",**kwargs):
+    """print() that stays silent unless the RANK env var is 0 or unset."""
+    if int(os.environ.get('RANK', 0)) != 0:
+        return
+    print(s, **kwargs)
+def print_banner():
+    """Print the "nanochat" ASCII-art banner via print0."""
+    # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
+    banner = """
+                                                       █████                █████
+                                                      ░░███                ░░███
+     ████████    ██████   ████████    ██████   ██████  ░███████    ██████  ███████
+    ░░███░░███  ░░░░░███ ░░███░░███  ███░░███ ███░░███ ░███░░███  ░░░░░███░░░███░
+     ░███ ░███   ███████  ░███ ░███ ░███ ░███░███ ░░░  ░███ ░███   ███████  ░███
+     ░███ ░███  ███░░███  ░███ ░███ ░███ ░███░███  ███ ░███ ░███  ███░░███  ░███ ███
+     ████ █████░░████████ ████ █████░░██████ ░░██████  ████ █████░░███████  ░░█████
+    ░░░░ ░░░░░  ░░░░░░░░ ░░░░ ░░░░░  ░░░░░░   ░░░░░░  ░░░░ ░░░░░  ░░░░░░░░   ░░░░░
+    """
+    print0(banner)
+def is_ddp_requested() -> bool:
+    """
+    Report whether RANK, LOCAL_RANK and WORLD_SIZE are all present in the
+    environment (as set by torchrun), regardless of whether a process group
+    has been initialized yet. Used to decide whether we *should* initialize one.
+    """
+    for key in ("RANK", "LOCAL_RANK", "WORLD_SIZE"):
+        if key not in os.environ:
+            return False
+    return True
+def is_ddp_initialized() -> bool:
+    """
+    Report whether torch.distributed is available and its process group has
+    been initialized. Used at cleanup to avoid destroying a non-existent group.
+    """
+    if not dist.is_available():
+        return False
+    return dist.is_initialized()
+def get_dist_info():
+    """Return (ddp_requested, rank, local_rank, world_size).
+    Values are read from torchrun's environment variables when present;
+    otherwise the single-process defaults (False, 0, 0, 1) are returned.
+    """
+    if not is_ddp_requested():
+        return False, 0, 0, 1
+    # torchrun's env decides whether we SHOULD init; the actual
+    # initialization happens in compute init.
+    env = os.environ
+    assert all(var in env for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
+    rank = int(env['RANK'])
+    local_rank = int(env['LOCAL_RANK'])
+    world_size = int(env['WORLD_SIZE'])
+    return True, rank, local_rank, world_size
+def autodetect_device_type():
+    """Return "cuda", "mps" or "cpu": the first available, in that order of preference."""
+    if torch.cuda.is_available():
+        chosen = "cuda"
+    else:
+        chosen = "mps" if torch.backends.mps.is_available() else "cpu"
+    print0(f"Autodetected device type: {chosen}")
+    return chosen
+def compute_init(device_type="cuda"): # cuda|cpu|mps
+    """Basic initialization that we keep doing over and over, so make common.
+    Validates the device type, seeds the global RNG, sets matmul precision on
+    CUDA, and initializes the NCCL process group when DDP is requested on CUDA.
+    Returns (is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device).
+    """
+    assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"
+    if device_type == "cuda":
+        assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"
+    if device_type == "mps":
+        assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"
+    # Reproducibility
+    # Note that we set the global seeds here, but most of the code uses explicit rng objects.
+    # The only place where global rng might be used is nn.Module initialization of the model weights.
+    torch.manual_seed(42)
+    if device_type == "cuda":
+        torch.cuda.manual_seed(42)
+    # skipping full reproducibility for now, possibly investigate slowdown later
+    # torch.use_deterministic_algorithms(True)
+    # Precision
+    if device_type == "cuda":
+        torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls, see https://docs.pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html
+    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
+    # NOTE: the local name below shadows the module-level is_ddp_requested() function inside this body.
+    is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
+    if is_ddp_requested and device_type == "cuda":
+        device = torch.device("cuda", ddp_local_rank)
+        torch.cuda.set_device(device)  # make "cuda" default to this device
+        dist.init_process_group(backend="nccl", device_id=device)
+        dist.barrier()
+    else:
+        # No process group is created on this path, even if the torchrun env vars are set;
+        # the returned is_ddp_requested flag still reflects the environment.
+        device = torch.device(device_type) # mps|cpu
+    if ddp_rank == 0:
+        logger.info(f"Distributed world size: {ddp_world_size}")
+    return is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device
+def compute_cleanup():
+    """Counterpart to compute_init: destroy the process group, if one was initialized."""
+    if not is_ddp_initialized():
+        return
+    dist.destroy_process_group()
+class DummyWandb:
+    """No-op stand-in exposing the same log()/finish() signatures, for runs without wandb."""
+    def __init__(self):
+        """Nothing to set up."""
+    def log(self, *args, **kwargs):
+        """Accept and discard any arguments."""
+        return None
+    def finish(self):
+        """Do nothing."""
+        return None
+# Hardcoded BF16 peak flops per GPU model.
+# inspired by torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/tools/utils.py
+# and PR: https://github.com/karpathy/nanochat/pull/147
+def get_peak_flops(device_name: str) -> float:
+    """Look up the peak flops for `device_name` (case-insensitive substring match).
+    A table row matches when every one of its substrings occurs in the name;
+    the first matching row wins. Unknown devices log a warning and return inf.
+    """
+    lowered = device_name.lower()
+    # Row order matters: more specific patterns must come before general ones.
+    table = (
+        # NVIDIA Blackwell
+        (("gb200",), 2.5e15),
+        (("grace blackwell",), 2.5e15),
+        (("b200",), 2.25e15),
+        (("b100",), 1.8e15),
+        # NVIDIA Hopper
+        (("h200", "nvl"), 836e12),
+        (("h200", "pcie"), 836e12),
+        (("h200",), 989e12),
+        (("h100", "nvl"), 835e12),
+        (("h100", "pcie"), 756e12),
+        (("h100",), 989e12),
+        (("h800", "nvl"), 989e12),
+        (("h800",), 756e12),
+        # NVIDIA Ampere data center
+        (("a100",), 312e12),
+        (("a800",), 312e12),
+        (("a40",), 149.7e12),
+        (("a30",), 165e12),
+        # NVIDIA Ada data center
+        (("l40s",), 362e12),
+        (("l40-s",), 362e12),
+        (("l40 s",), 362e12),
+        (("l4",), 121e12),
+        # AMD CDNA accelerators
+        (("mi355",), 2.5e15),
+        (("mi325",), 1.3074e15),
+        (("mi300x",), 1.3074e15),
+        (("mi300a",), 980.6e12),
+        (("mi250x",), 383e12),
+        (("mi250",), 362.1e12),
+        # Consumer RTX
+        (("5090",), 209.5e12),
+        (("4090",), 165.2e12),
+        (("3090",), 71e12),
+    )
+    matched = next(
+        (flops for needles, flops in table if all(needle in lowered for needle in needles)),
+        None,
+    )
+    if matched is not None:
+        return matched
+    if "data center gpu max 1550" in lowered:
+        # Ponte Vecchio (PVC) - dynamic based on compute units
+        max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units
+        return 512 * max_comp_units * 1300 * 10**6
+    # Unknown GPU - return inf so MFU shows as 0% rather than a wrong guess
+    logger.warning(f"Peak flops undefined for: {device_name}, MFU will show as 0%")
+    return float('inf')

nanochat/engine.py ADDED Viewed

	@@ -0,0 +1,360 @@

+"""
+Engine for efficient inference of our models.
+Everything works around token sequences:
+- The user can send token sequences to the engine
+- The engine returns the next token
+Notes:
+- The engine knows nothing about tokenization, it's purely token id sequences.
+The whole thing is made as efficient as possible.
+"""
+import torch
+import torch.nn.functional as F
+import signal
+import warnings
+from contextlib import contextmanager
+from collections import deque
+from nanochat.common import compute_init, autodetect_device_type
+from nanochat.checkpoint_manager import load_model
+# -----------------------------------------------------------------------------
+# Calculator tool helpers
+@contextmanager
+def timeout(duration, formula):
+    """Context manager that raises if its body runs longer than `duration` seconds.
+    Implemented with SIGALRM. `formula` only appears in the error message.
+    On exit (normal or exceptional) the pending alarm is cancelled and the
+    previous SIGALRM handler is put back.
+    """
+    def timeout_handler(signum, frame):
+        raise Exception(f"'{formula}': timed out after {duration} seconds")
+    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
+    signal.alarm(duration)
+    try:
+        yield
+    finally:
+        # Always disarm, so an exception in the body cannot leave an alarm
+        # that fires later in unrelated code.
+        signal.alarm(0)
+        # signal.signal() returns None when the old handler was not installed
+        # from Python; that value cannot be passed back in.
+        if previous_handler is not None:
+            signal.signal(signal.SIGALRM, previous_handler)
+def eval_with_timeout(formula, max_time=3):
+    """Evaluate `formula` with empty builtins, giving up after `max_time` seconds.
+    Returns the value, or None on any exception (including the timeout).
+    """
+    # NOTE(review): this is eval() on caller-supplied text; the only guards are the
+    # empty __builtins__ and whatever filtering the caller did beforehand.
+    try:
+        with timeout(max_time, formula), warnings.catch_warnings():
+            warnings.simplefilter("ignore", SyntaxWarning)
+            result = eval(formula, {"__builtins__": {}}, {})
+        return result
+    except Exception:
+        # Wrong calculator usage is expected; disarm the alarm and report no result.
+        signal.alarm(0)
+        return None
+def use_calculator(expr):
+    """
+    Evaluate a restricted Python expression and return its value, or None if rejected.
+    Two forms are accepted: pure arithmetic (digits, + - * / . parentheses, spaces;
+    no ** operator) and string expressions that use .count().
+    """
+    # Drop thousands separators
+    expr = expr.replace(",", "")
+    used = set(expr)
+    # Pure math expression (old behavior)
+    if used <= set("0123456789*+-/.() "):
+        # the power operator is disallowed
+        return None if "**" in expr else eval_with_timeout(expr)
+    # String operation: only quotes, letters, digits, spaces, parens, dot and underscore
+    permitted = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'\"()._ ")
+    if not used <= permitted:
+        return None
+    # Reject anything containing a dangerous substring (case-insensitive)
+    banned = ('__', 'import', 'exec', 'eval', 'compile', 'open', 'file',
+              'input', 'raw_input', 'globals', 'locals', 'vars', 'dir',
+              'getattr', 'setattr', 'delattr', 'hasattr')
+    lowered = expr.lower()
+    for fragment in banned:
+        if fragment in lowered:
+            return None
+    # Only the .count() method is supported for now
+    if '.count(' not in expr:
+        return None
+    return eval_with_timeout(expr)
+# -----------------------------------------------------------------------------
+class KVCache:
+    """
+    KV Cache designed for Flash Attention 3's flash_attn_with_kvcache API.
+    Compared with an FA2-style cache:
+    - tensors are laid out (B, T, H, D) rather than (B, H, T, D)
+    - FA3 writes into the cache in-place during flash_attn_with_kvcache
+    - the position is tracked per batch element in the cache_seqlens tensor
+    """
+    def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers, device, dtype):
+        """Pre-allocate zeroed key/value storage for all layers and an all-zero position vector."""
+        self.batch_size = batch_size
+        self.max_seq_len = seq_len
+        self.n_layers = num_layers
+        self.n_heads = num_heads
+        self.head_dim = head_dim
+        # Storage shape: (n_layers, B, T, H, D)
+        shape = (num_layers, batch_size, seq_len, num_heads, head_dim)
+        self.k_cache = torch.zeros(shape, device=device, dtype=dtype)
+        self.v_cache = torch.zeros(shape, device=device, dtype=dtype)
+        # Per-batch-element sequence length (FA3 needs int32)
+        self.cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
+    def reset(self):
+        """Mark the cache as empty by zeroing every position."""
+        self.cache_seqlens.zero_()
+    def get_pos(self):
+        """Return the current position, read from batch element 0 (all rows assumed equal)."""
+        first = self.cache_seqlens[0]
+        return first.item()
+    def get_layer_cache(self, layer_idx):
+        """Return the (k_cache, v_cache) views belonging to layer `layer_idx`."""
+        keys = self.k_cache[layer_idx]
+        values = self.v_cache[layer_idx]
+        return keys, values
+    def advance(self, num_tokens):
+        """Move every row's position forward by `num_tokens`."""
+        self.cache_seqlens += num_tokens
+    def prefill(self, other):
+        """
+        Copy the filled part of `other` into this (empty) cache.
+        Used after a batch=1 prefill, to seed a cache that generates several samples in parallel.
+        """
+        assert self.get_pos() == 0, "Cannot prefill a non-empty KV cache"
+        assert self.n_layers == other.n_layers and self.n_heads == other.n_heads and self.head_dim == other.head_dim
+        assert self.max_seq_len >= other.max_seq_len
+        filled = other.get_pos()
+        # Only the first `filled` time steps of `other` hold data.
+        self.k_cache[:, :, :filled] = other.k_cache[:, :, :filled]
+        self.v_cache[:, :, :filled] = other.v_cache[:, :, :filled]
+        self.cache_seqlens.fill_(filled)
+# -----------------------------------------------------------------------------
+@torch.inference_mode()
+def sample_next_token(logits, rng, temperature=1.0, top_k=None):
+    """Draw one next token per row from logits of shape (B, vocab_size); returns (B, 1).
+    temperature == 0 means greedy argmax. A positive `top_k` restricts sampling to
+    the k highest logits. `rng` is the torch.Generator used for sampling.
+    """
+    assert temperature >= 0.0, "temperature must be non-negative"
+    if temperature == 0.0:
+        return torch.argmax(logits, dim=-1, keepdim=True)
+    use_top_k = top_k is not None and top_k > 0
+    if not use_top_k:
+        # Sample from the full distribution.
+        probs = F.softmax(logits / temperature, dim=-1)
+        return torch.multinomial(probs, num_samples=1, generator=rng)
+    # Sample among the k largest logits, then map back to vocabulary ids.
+    k = min(top_k, logits.size(-1))
+    top_vals, top_idx = torch.topk(logits, k, dim=-1)
+    probs = F.softmax(top_vals / temperature, dim=-1)
+    picked = torch.multinomial(probs, num_samples=1, generator=rng)
+    return top_idx.gather(1, picked)
+# -----------------------------------------------------------------------------
+class RowState:
+    """Per-row bookkeeping kept while generating."""
+    def __init__(self, current_tokens=None):
+        # Token sequence of this row so far (a fresh list when none/empty is given)
+        self.current_tokens = current_tokens if current_tokens else []
+        # Tokens queued to be force-injected instead of sampled
+        self.forced_tokens = deque()
+        # True while inside a python block
+        self.in_python_block = False
+        # Tokens of the python expression currently being collected
+        self.python_expr_tokens = []
+        # True once this row has finished generating
+        self.completed = False
+class Engine:
+    def __init__(self, model, tokenizer):
+        self.model = model
+        self.tokenizer = tokenizer # needed for tool use
+    @torch.inference_mode()
+    def generate(self, tokens, num_samples=1, max_tokens=None, temperature=1.0, top_k=None, seed=42, repetition_penalty=1.0, repetition_window=64):
+        """Same as generate, but does single prefill and then clones the KV cache."""
+        assert isinstance(tokens, list) and isinstance(tokens[0], int), "expecting list of ints"
+        device = self.model.get_device()
+        # NOTE: setting the dtype here and in this way is an ugly hack.
+        # Currently the repo assumes that cuda -> bfloat16 and everything else -> float32.
+        # We need to know the dtype here to call __init__ on KVCache and pre-allocate its tensors.
+        # As a quick hack, we're making generate() function inherit and know about this repo-wise assumption.
+        # I think there has to be a bigger refactor to deal with device/dtype tracking across the codebase.
+        # In particular, the KVCache should allocate its tensors lazily
+        dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
+        rng = torch.Generator(device=device)
+        rng.manual_seed(seed)
+        # Get the special tokens we need to coordinate the tool use state machine
+        get_special = lambda s: self.tokenizer.encode_special(s)
+        python_start = get_special("<|python_start|>")
+        python_end = get_special("<|python_end|>")
+        output_start = get_special("<|output_start|>")
+        output_end = get_special("<|output_end|>")
+        assistant_end = get_special("<|assistant_end|>") # if sampled, ends row
+        bos = self.tokenizer.get_bos_token_id() # if sampled, ends row
+        # 1) Run a batch 1 prefill of the prompt tokens
+        m = self.model.config
+        kv_model_kwargs = {"num_heads": m.n_kv_head, "head_dim": m.n_embd // m.n_head, "num_layers": m.n_layer}
+        kv_cache_prefill = KVCache(
+            batch_size=1,
+            seq_len=len(tokens),
+            device=device,
+            dtype=dtype,
+            **kv_model_kwargs,
+        )
+        ids = torch.tensor([tokens], dtype=torch.long, device=device)
+        logits = self.model.forward(ids, kv_cache=kv_cache_prefill)
+        logits = logits[:, -1, :].expand(num_samples, -1)  # (num_samples, vocab_size)
+        # 2) Replicate the KV cache for each sample/row
+        kv_length_hint = (len(tokens) + max_tokens) if max_tokens is not None else self.model.config.sequence_len
+        kv_cache_decode = KVCache(
+            batch_size=num_samples,
+            seq_len=kv_length_hint,
+            device=device,
+            dtype=dtype,
+            **kv_model_kwargs,
+        )
+        kv_cache_decode.prefill(kv_cache_prefill)
+        del kv_cache_prefill # no need to keep this memory around
+        # 3) Initialize states for each sample
+        row_states = [RowState(tokens.copy()) for _ in range(num_samples)]
+        # 4) Main generation loop
+        num_generated = 0
+        while True:
+            # Stop condition: we've reached max tokens
+            if max_tokens is not None and num_generated >= max_tokens:
+                break
+            # Stop condition: all rows are completed
+            if all(state.completed for state in row_states):
+                break
+            # Sample the next token for each row
+            if repetition_penalty != 1.0:  # Victorian repetition penalty patch
+                _pen = logits.clone()
+                for _i, _s in enumerate(row_states):
+                    if not _s.completed:
+                        for _t in set(_s.current_tokens[-repetition_window:]):
+                            _pen[_i, _t] = (_pen[_i, _t] / repetition_penalty
+                                            if _pen[_i, _t] > 0 else _pen[_i, _t] * repetition_penalty)
+                next_ids = sample_next_token(_pen, rng, temperature, top_k)
+            else:
+                next_ids = sample_next_token(logits, rng, temperature, top_k)  # (B, 1)
+            sampled_tokens = next_ids[:, 0].tolist()
+            # Process each row: choose the next token, update state, optional tool use
+            token_column = [] # contains the next token id along each row
+            token_masks = [] # contains the mask (was it sampled (1) or forced (0)?) along each row
+            for i, state in enumerate(row_states):
+                # Select the next token in this row
+                is_forced = len(state.forced_tokens) > 0 # are there tokens waiting to be forced in deque?
+                token_masks.append(0 if is_forced else 1) # mask is 0 if forced, 1 if sampled
+                next_token = state.forced_tokens.popleft() if is_forced else sampled_tokens[i]
+                token_column.append(next_token)
+                # Update the state of this row to include the next token
+                state.current_tokens.append(next_token)
+                # On <|assistant_end|> or <|bos|>, mark the row as completed
+                if next_token == assistant_end or next_token == bos:
+                    state.completed = True
+                # Handle tool logic
+                if next_token == python_start:
+                    state.in_python_block = True
+                    state.python_expr_tokens = []
+                elif next_token == python_end and state.in_python_block:
+                    state.in_python_block = False
+                    if state.python_expr_tokens:
+                        expr = self.tokenizer.decode(state.python_expr_tokens)
+                        result = use_calculator(expr)
+                        if result is not None:
+                            result_tokens = self.tokenizer.encode(str(result))
+                            state.forced_tokens.append(output_start)
+                            state.forced_tokens.extend(result_tokens)
+                            state.forced_tokens.append(output_end)
+                    state.python_expr_tokens = []
+                elif state.in_python_block:
+                    state.python_expr_tokens.append(next_token)
+            # Yield the token column
+            yield token_column, token_masks
+            num_generated += 1
+            # Prepare logits for next iteration
+            ids = torch.tensor(token_column, dtype=torch.long, device=device).unsqueeze(1)
+            logits = self.model.forward(ids, kv_cache=kv_cache_decode)[:, -1, :]  # (B, vocab_size)
+    def generate_batch(self, tokens, num_samples=1, **kwargs):
+        """
+        Non-streaming batch generation that just returns the final token sequences.
+        Returns a list of token sequences (list of lists of ints).
+        Terminal tokens (assistant_end, bos) are not included in the results.
+        """
+        assistant_end = self.tokenizer.encode_special("<|assistant_end|>")
+        bos = self.tokenizer.get_bos_token_id()
+        results = [tokens.copy() for _ in range(num_samples)]
+        masks = [[0] * len(tokens) for _ in range(num_samples)]
+        completed = [False] * num_samples
+        for token_column, token_masks in self.generate(tokens, num_samples, **kwargs):
+            for i, (token, mask) in enumerate(zip(token_column, token_masks)):
+                if not completed[i]:
+                    if token == assistant_end or token == bos:
+                        completed[i] = True
+                    else:
+                        results[i].append(token)
+                        masks[i].append(mask)
+            # Stop if all rows are completed
+            if all(completed):
+                break
+        return results, masks
+if __name__ == "__main__":
+    """
+    Quick inline test to make sure that the naive/slow model.generate function
+    is equivalent to the faster Engine.generate function here.
+    """
+    import time
+    # init compute
+    device_type = autodetect_device_type()
+    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
+    # load the model and tokenizer
+    model, tokenizer, meta = load_model("base", device, phase="eval")
+    bos_token_id = tokenizer.get_bos_token_id()
+    # common hyperparameters
+    kwargs = dict(max_tokens=64, temperature=0.0)
+    # set the starting prompt
+    prompt_tokens = tokenizer.encode("The chemical formula of water is", prepend=bos_token_id)
+    # generate the reference sequence using the model.generate() function
+    generated_tokens = []
+    torch.cuda.synchronize()
+    t0 = time.time()
+    stream = model.generate(prompt_tokens, **kwargs)
+    for token in stream:
+        generated_tokens.append(token)
+        chunk = tokenizer.decode([token])
+        print(chunk, end="", flush=True)
+    print()
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"Reference time: {t1 - t0:.2f}s")
+    reference_ids = generated_tokens
+    # generate tokens with Engine
+    generated_tokens = []
+    engine = Engine(model, tokenizer)
+    stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32
+    torch.cuda.synchronize()
+    t0 = time.time()
+    for token_column, token_masks in stream:
+        token = token_column[0] # only print out the first row
+        generated_tokens.append(token)
+        chunk = tokenizer.decode([token])
+        print(chunk, end="", flush=True)
+    print()
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"Engine time: {t1 - t0:.2f}s")
+    # compare the two sequences
+    for i in range(len(reference_ids)):
+        if reference_ids[i] != generated_tokens[i]:
+            print(f"Mismatch at {i}: {reference_ids[i]} != {generated_tokens[i]}")
+            break
+    print(f"Match: {reference_ids == generated_tokens}")

nanochat/flash_attention.py ADDED Viewed

	@@ -0,0 +1,187 @@

+"""
+Unified Flash Attention interface with automatic FA3/SDPA switching.
+Exports `flash_attn` module that matches the FA3 API exactly, but falls back
+to PyTorch SDPA on non-Hopper GPUs (including Blackwell), MPS, and CPU.
+Usage (drop-in replacement for FA3):
+    from nanochat.flash_attention import flash_attn
+    # Training (no KV cache)
+    y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=window_size)
+    # Inference (with KV cache)
+    y = flash_attn.flash_attn_with_kvcache(q, k_cache, v_cache, k=k, v=v, ...)
+"""
+import torch
+import torch.nn.functional as F
+# =============================================================================
+# Detection: Try to load FA3 on Hopper+ GPUs
+# =============================================================================
+def _load_flash_attention_3():
+    """Try to load Flash Attention 3 (requires Hopper GPU, sm90)."""
+    if not torch.cuda.is_available():
+        return None
+    try:
+        major, _ = torch.cuda.get_device_capability()
+        # FA3 kernels are compiled for Hopper (sm90) only
+        # Ada (sm89), Blackwell (sm100) need SDPA fallback until FA3 is recompiled
+        if major != 9:
+            return None
+        import os
+        os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
+        from kernels import get_kernel
+        return get_kernel('varunneal/flash-attention-3').flash_attn_interface
+    except Exception:
+        return None
+# Probe for FA3 once at import time; _fa3 is None when it could not be loaded
+_fa3 = _load_flash_attention_3()
+HAS_FA3 = _fa3 is not None
+# Override for testing: set to 'fa3', 'sdpa', or None (auto)
+# NOTE: USE_FA3 below is computed from this value once at import time, so assigning
+# _override_impl later does not by itself change USE_FA3.
+_override_impl = None
+def _resolve_use_fa3():
+    """Decide once whether to use FA3, based on availability, override, and dtype."""
+    if _override_impl == 'fa3':
+        assert HAS_FA3, "Cannot override to FA3: not available on this hardware"
+        return True
+    if _override_impl == 'sdpa':
+        return False
+    if HAS_FA3:
+        # FA3 Hopper kernels only support bf16 and fp8; fp16/fp32 must use SDPA fallback
+        from nanochat.common import COMPUTE_DTYPE
+        if COMPUTE_DTYPE == torch.bfloat16:
+            return True
+        return False
+    return False
+# Backend decision, resolved once at import time and read by the public functions below
+USE_FA3 = _resolve_use_fa3()
+# =============================================================================
+# SDPA helpers
+# =============================================================================
+def _sdpa_attention(q, k, v, window_size, enable_gqa):
+    """
+    SDPA attention with sliding window support.
+    q, k, v are (B, H, T, D) format.
+    """
+    Tq = q.size(2)
+    Tk = k.size(2)
+    window = window_size[0]
+    # Full context, same length
+    if (window < 0 or window >= Tq) and Tq == Tk:
+        return F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=enable_gqa)
+    # Single token generation
+    if Tq == 1:
+        if window >= 0 and window < Tk:
+            # window is "left" tokens we need to include (window + 1) keys total
+            start = max(0, Tk - (window + 1))
+            k = k[:, :, start:, :]
+            v = v[:, :, start:, :]
+        return F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=enable_gqa)
+    # Need explicit mask for sliding window/chunk inference
+    device = q.device
+    # For chunk inference (Tq != Tk), is_causal is not aligned to cache position => build an explicit bool mask
+    row_idx = (Tk - Tq) + torch.arange(Tq, device=device).unsqueeze(1)
+    col_idx = torch.arange(Tk, device=device).unsqueeze(0)
+    mask = col_idx <= row_idx
+    # sliding window (left)
+    if window >= 0 and window < Tk:
+        mask = mask & ((row_idx - col_idx) <= window)
+    return F.scaled_dot_product_attention(q, k, v, attn_mask=mask, enable_gqa=enable_gqa)
+# =============================================================================
+# Public API: Same interface as FA3
+# =============================================================================
+def flash_attn_func(q, k, v, causal=False, window_size=(-1, -1)):
+    """
+    Flash Attention for training (no KV cache).
+    Args:
+        q, k, v: Tensors of shape (B, T, H, D)
+        causal: Whether to use causal masking
+        window_size: (left, right) sliding window. -1 means unlimited.
+    Returns:
+        Output tensor of shape (B, T, H, D)
+    """
+    if USE_FA3:
+        return _fa3.flash_attn_func(q, k, v, causal=causal, window_size=window_size)
+    # SDPA fallback: transpose (B, T, H, D) -> (B, H, T, D)
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    enable_gqa = q.size(1) != k.size(1)
+    y = _sdpa_attention(q, k, v, window_size, enable_gqa)
+    return y.transpose(1, 2)  # back to (B, T, H, D)
+def flash_attn_with_kvcache(q, k_cache, v_cache, k=None, v=None, cache_seqlens=None,
+                            causal=False, window_size=(-1, -1)):
+    """
+    Flash Attention with KV cache for inference.
+    FA3 updates k_cache/v_cache in-place. Our SDPA fallback does the same.
+    Args:
+        q: Queries, shape (B, T_new, H, D)
+        k_cache, v_cache: Pre-allocated cache tensors, shape (B, T_max, H_kv, D)
+        k, v: New keys/values to insert, shape (B, T_new, H_kv, D)
+        cache_seqlens: Current position in cache, shape (B,) int32
+        causal: Whether to use causal masking
+        window_size: (left, right) sliding window. -1 means unlimited.
+    Returns:
+        Output tensor of shape (B, T_new, H, D)
+    """
+    if USE_FA3:
+        return _fa3.flash_attn_with_kvcache(
+            q, k_cache, v_cache, k=k, v=v, cache_seqlens=cache_seqlens,
+            causal=causal, window_size=window_size
+        )
+    # SDPA fallback: manually manage KV cache
+    B, T_new, H, D = q.shape
+    pos = cache_seqlens[0].item()  # assume uniform position across batch
+    # Insert new k, v into cache (in-place, matching FA3 behavior)
+    if k is not None and v is not None:
+        k_cache[:, pos:pos+T_new, :, :] = k
+        v_cache[:, pos:pos+T_new, :, :] = v
+    # Get full cache up to current position + new tokens
+    end_pos = pos + T_new
+    k_full = k_cache[:, :end_pos, :, :]
+    v_full = v_cache[:, :end_pos, :, :]
+    # Transpose to SDPA layout: (B, T, H, D) -> (B, H, T, D)
+    q_sdpa = q.transpose(1, 2)
+    k_sdpa = k_full.transpose(1, 2)
+    v_sdpa = v_full.transpose(1, 2)
+    enable_gqa = q_sdpa.size(1) != k_sdpa.size(1)
+    y_sdpa = _sdpa_attention(q_sdpa, k_sdpa, v_sdpa, window_size, enable_gqa)
+    return y_sdpa.transpose(1, 2)  # back to (B, T, H, D)
+# =============================================================================
+# Export: flash_attn module interface (drop-in replacement for FA3)
+# =============================================================================
+from types import SimpleNamespace
+# Namespace object exposing the two public functions under the attribute names
+# callers use (flash_attn.flash_attn_func / flash_attn.flash_attn_with_kvcache)
+flash_attn = SimpleNamespace(
+    flash_attn_func=flash_attn_func,
+    flash_attn_with_kvcache=flash_attn_with_kvcache,
+)

nanochat/gpt.py ADDED Viewed

	@@ -0,0 +1,465 @@

+"""
+GPT model (rewrite, a lot simpler)
+Notable features:
+- rotary embeddings (and no positional embeddings)
+- QK norm
+- untied weights for token embedding and lm_head
+- relu^2 activation in MLP
+- norm after token embedding
+- no learnable params in rmsnorm
+- no bias in linear layers
+- Group-Query Attention (GQA) support for more efficient inference
+- Flash Attention 3 integration
+"""
+from functools import partial
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nanochat.common import get_dist_info, print0, COMPUTE_DTYPE
+from nanochat.optim import MuonAdamW, DistMuonAdamW
+# Our custom Flash Attention module that automatically uses FA3 on Hopper+ and SDPA fallback elsewhere
+from nanochat.flash_attention import flash_attn
+@dataclass
+class GPTConfig:
+    """Architecture hyperparameters consumed by the GPT model and its sub-modules."""
+    sequence_len: int = 2048
+    vocab_size: int = 32768
+    n_layer: int = 12
+    n_head: int = 6 # number of query heads
+    n_kv_head: int = 6 # number of key/value heads (GQA)
+    n_embd: int = 768
+    # Sliding window attention pattern string, tiled across layers. Final layer always L.
+    # Characters: L=long (full context), S=short (about a third of sequence_len,
+    # rounded up to a multiple of 128; see GPT._compute_window_sizes)
+    # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long
+    window_pattern: str = "SSSL"
+def norm(x):
+    return F.rms_norm(x, (x.size(-1),)) # note that this will run in bf16, seems ok
+class Linear(nn.Linear):
+    """nn.Linear that casts weights to match input dtype in forward.
+    Replaces autocast: master weights stay fp32 for optimizer precision,
+    but matmuls run in the activation dtype (typically bf16 from embeddings)."""
+    def forward(self, x):
+        return F.linear(x, self.weight.to(dtype=x.dtype))
+def has_ve(layer_idx, n_layer):
+    """Returns True if GPT layer should have Value Embedding (alternating, last layer always included)."""
+    return layer_idx % 2 == (n_layer - 1) % 2
+def apply_rotary_emb(x, cos, sin):
+    assert x.ndim == 4  # multihead attention
+    d = x.shape[3] // 2
+    x1, x2 = x[..., :d], x[..., d:] # split up last dim into two halves
+    y1 = x1 * cos + x2 * sin # rotate pairs of dims
+    y2 = x1 * (-sin) + x2 * cos
+    return torch.cat([y1, y2], 3)
+class CausalSelfAttention(nn.Module):
+    """
+    Causal multi-head self-attention with separate query and key/value head counts,
+    rotary embeddings, QK norm, an optional gated value embedding, and an optional KV cache.
+    """
+    def __init__(self, config, layer_idx):
+        """Build the q/k/v/output projections for layer `layer_idx` from `config` (n_head, n_kv_head, n_embd, n_layer)."""
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.n_head = config.n_head
+        self.n_kv_head = config.n_kv_head
+        self.n_embd = config.n_embd
+        self.head_dim = self.n_embd // self.n_head
+        assert self.n_embd % self.n_head == 0
+        # query heads must split evenly into groups that share one key/value head
+        assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0
+        self.c_q = Linear(self.n_embd, self.n_head * self.head_dim, bias=False)
+        self.c_k = Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+        self.c_v = Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+        self.c_proj = Linear(self.n_embd, self.n_embd, bias=False)
+        # The gate reads only the first ve_gate_channels features of the input and emits
+        # one value per KV head; it exists only on layers for which has_ve() is True.
+        self.ve_gate_channels = 32  # Victorian checkpoint patch
+        self.ve_gate = Linear(self.ve_gate_channels, self.n_kv_head, bias=False) if has_ve(layer_idx, config.n_layer) else None
+    def forward(self, x, ve, cos_sin, window_size, kv_cache):
+        """
+        Args:
+            x: input of shape (B, T, C)
+            ve: value embedding for this layer (viewed as (B, T, n_kv_head, head_dim)), or None
+            cos_sin: (cos, sin) pair passed to apply_rotary_emb
+            window_size: (left, right) window tuple forwarded to the attention kernel
+            kv_cache: None for the no-cache path; otherwise an object providing
+                get_layer_cache(), cache_seqlens, n_layers and advance()
+        Returns:
+            Tensor of shape (B, T, n_embd)
+        """
+        B, T, C = x.size()
+        # Project the input to get queries, keys, and values
+        # Shape: (B, T, H, D) - FA3's native layout, no transpose needed!
+        q = self.c_q(x).view(B, T, self.n_head, self.head_dim)
+        k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim)
+        v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim)
+        # Value residual (ResFormer): mix in value embedding with input-dependent gate per head
+        if ve is not None:
+            ve = ve.view(B, T, self.n_kv_head, self.head_dim)
+            gate = 3 * torch.sigmoid(self.ve_gate(x[..., :self.ve_gate_channels]))  # (B, T, n_kv_head), range (0, 3)
+            v = v + gate.unsqueeze(-1) * ve
+        # Apply Rotary Embeddings to queries and keys to get relative positional encoding
+        cos, sin = cos_sin
+        q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin)
+        q, k = norm(q), norm(k) # QK norm
+        q = q * 1.15  # sharper attention (split scale between Q and K), TODO think through better
+        k = k * 1.15
+        # Flash Attention (FA3 on Hopper+, PyTorch SDPA fallback elsewhere)
+        # window_size is (left, right) tuple: (N, 0) for causal, (-1, 0) for full context
+        if kv_cache is None:
+            # Training: causal attention with optional sliding window
+            y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=window_size)
+        else:
+            # Inference: use flash_attn_with_kvcache which handles cache management
+            k_cache, v_cache = kv_cache.get_layer_cache(self.layer_idx)
+            y = flash_attn.flash_attn_with_kvcache(
+                q, k_cache, v_cache,
+                k=k, v=v,
+                cache_seqlens=kv_cache.cache_seqlens,
+                causal=True,
+                window_size=window_size,
+            )
+            # Advance position after last layer processes
+            # (every layer reads the same cache_seqlens, so it may only move once per forward pass)
+            if self.layer_idx == kv_cache.n_layers - 1:
+                kv_cache.advance(T)
+        # Re-assemble the heads and project back to residual stream
+        y = y.contiguous().view(B, T, -1)
+        y = self.c_proj(y)
+        return y
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = Linear(config.n_embd, 4 * config.n_embd, bias=False)
+        self.c_proj = Linear(4 * config.n_embd, config.n_embd, bias=False)
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = F.relu(x).square()
+        x = self.c_proj(x)
+        return x
+class Block(nn.Module):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.attn = CausalSelfAttention(config, layer_idx)
+        self.mlp = MLP(config)
+    def forward(self, x, ve, cos_sin, window_size, kv_cache):
+        x = x + self.attn(norm(x), ve, cos_sin, window_size, kv_cache)
+        x = x + self.mlp(norm(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, config, pad_vocab_size_to=64):
+        """
+        Declare all modules, parameters and buffers of the model.
+        NOTE a major footgun: this __init__ function runs in meta device context (!!)
+        Therefore, any calculations inside here are shapes and dtypes only, no actual data.
+        => We actually initialize all data (parameters, buffers, etc.) in init_weights() instead.
+        Args:
+            config: GPTConfig-style object (sequence_len, vocab_size, n_layer, n_head, n_kv_head, n_embd, window_pattern)
+            pad_vocab_size_to: the embedding / lm_head vocab dimension is rounded up to a multiple of this
+        """
+        super().__init__()
+        self.config = config
+        # Compute per-layer window sizes for sliding window attention
+        # window_sizes holds one (left, right) tuple per layer, as built by _compute_window_sizes()
+        self.window_sizes = self._compute_window_sizes(config)
+        # Pad vocab for efficiency (DDP, tensor cores). This is just an optimization - outputs are cropped in forward().
+        # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.resize_token_embeddings
+        padded_vocab_size = ((config.vocab_size + pad_vocab_size_to - 1) // pad_vocab_size_to) * pad_vocab_size_to
+        if padded_vocab_size != config.vocab_size:
+            print0(f"Padding vocab_size from {config.vocab_size} to {padded_vocab_size} for efficiency")
+        self.transformer = nn.ModuleDict({
+            "wte": nn.Embedding(padded_vocab_size, config.n_embd),
+            "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]),
+        })
+        self.lm_head = Linear(config.n_embd, padded_vocab_size, bias=False)
+        # Per-layer learnable scalars (inspired by modded-nanogpt)
+        # resid_lambdas: scales the residual stream at each layer (1.0 = neutral)
+        # x0_lambdas: blends initial embedding back in at each layer (0.0 would disable it;
+        #             init_weights() sets the real starting value, currently 0.1)
+        # Separate parameters so they can have different optimizer treatment
+        self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer))   # fake init, real init in init_weights()
+        self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer))     # fake init, real init in init_weights()
+        # Value embeddings (ResFormer-style): alternating layers, last layer always included
+        head_dim = config.n_embd // config.n_head
+        kv_dim = config.n_kv_head * head_dim
+        # keyed by the layer index as a string; only layers where has_ve() is True get an entry
+        self.value_embeds = nn.ModuleDict({str(i): nn.Embedding(padded_vocab_size, kv_dim) for i in range(config.n_layer) if has_ve(i, config.n_layer)})
+        # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only.
+        # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory,
+        # so let's just over-compute them by 10X, but assert fail if we ever reach that amount.
+        # In the future we can dynamically grow the cache, for now it's fine.
+        self.rotary_seq_len = config.sequence_len * 10 # 10X over-compute should be enough, TODO make nicer?
+        head_dim = config.n_embd // config.n_head
+        cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
+        self.register_buffer("cos", cos, persistent=False) # persistent=False means it's not saved to the checkpoint
+        self.register_buffer("sin", sin, persistent=False)
+    @torch.no_grad()
+    def init_weights(self):
+        """
+        Initialize the full model in this one function for maximum clarity.
+        wte (embedding):     normal, std=0.8
+        lm_head:             normal, std=0.001
+        for each block:
+            attn.c_q:        uniform, std=1/sqrt(n_embd)
+            attn.c_k:        uniform, std=1/sqrt(n_embd)
+            attn.c_v:        uniform, std=1/sqrt(n_embd)
+            attn.c_proj:     zeros
+            mlp.c_fc:        uniform, std=0.5/sqrt(n_embd)
+            mlp.c_proj:      zeros
+            attn.ve_gate:    uniform in [0, 0.02] (only on layers that have one)
+        resid_lambdas:       1.0
+        x0_lambdas:          0.1
+        value_embeds:        uniform, std=1/sqrt(n_embd)
+        Also recomputes the rotary cos/sin buffers and, unless COMPUTE_DTYPE is
+        float16, casts the token and value embeddings to COMPUTE_DTYPE.
+        """
+        # Embedding and unembedding
+        torch.nn.init.normal_(self.transformer.wte.weight, mean=0.0, std=0.8)
+        torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001)
+        # Transformer blocks: uniform init with bound = sqrt(3) * std (same standard deviation as normal)
+        n_embd = self.config.n_embd
+        s = 3**0.5 * n_embd**-0.5 # sqrt(3) multiplier makes sure Uniform achieves the same std as Normal
+        for block in self.transformer.h:
+            torch.nn.init.uniform_(block.attn.c_q.weight, -s, s) # weights use Uniform to avoid outliers
+            torch.nn.init.uniform_(block.attn.c_k.weight, -s, s)
+            torch.nn.init.uniform_(block.attn.c_v.weight, -s, s)
+            torch.nn.init.zeros_(block.attn.c_proj.weight) # projections are zero
+            torch.nn.init.uniform_(block.mlp.c_fc.weight, -s * 0.5, s * 0.5)  # 0.5x init scale for c_fc
+            torch.nn.init.zeros_(block.mlp.c_proj.weight)
+        # Per-layer scalars
+        self.resid_lambdas.fill_(1.0)   # 1.0 => typical residual connections at init
+        self.x0_lambdas.fill_(0.1)      # 0.1 => small initial weight for skip connection to input embedding
+        # Value embeddings (init like c_v: uniform with same std)
+        for ve in self.value_embeds.values():
+            torch.nn.init.uniform_(ve.weight, -s, s)
+        # Gate weights init with small positive values so gates start slightly above neutral
+        for block in self.transformer.h:
+            if block.attn.ve_gate is not None:
+                torch.nn.init.uniform_(block.attn.ve_gate.weight, 0.0, 0.02)
+        # Rotary embeddings (replaces the placeholder buffers registered in __init__)
+        head_dim = self.config.n_embd // self.config.n_head
+        cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
+        self.cos, self.sin = cos, sin
+        # Cast embeddings to COMPUTE_DTYPE: optimizer can tolerate reduced-precision
+        # embeddings and it saves memory. Exception: fp16 requires fp32 embeddings
+        # because GradScaler cannot unscale fp16 gradients.
+        if COMPUTE_DTYPE != torch.float16:
+            self.transformer.wte.to(dtype=COMPUTE_DTYPE)
+            for ve in self.value_embeds.values():
+                ve.to(dtype=COMPUTE_DTYPE)
+    def _precompute_rotary_embeddings(self, seq_len, head_dim, base=100000, device=None):
+        # TODO: bump base theta more? e.g. 100K is more common more recently
+        # autodetect the device from model embeddings
+        if device is None:
+            device = self.transformer.wte.weight.device
+        # stride the channels
+        channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+        inv_freq = 1.0 / (base ** (channel_range / head_dim))
+        # stride the time steps
+        t = torch.arange(seq_len, dtype=torch.float32, device=device)
+        # calculate the rotation frequencies at each (time, channel) pair
+        freqs = torch.outer(t, inv_freq)
+        cos, sin = freqs.cos(), freqs.sin()
+        cos, sin = cos.to(COMPUTE_DTYPE), sin.to(COMPUTE_DTYPE)
+        cos, sin = cos[None, :, None, :], sin[None, :, None, :] # add batch and head dims for later broadcasting
+        return cos, sin
+    def _compute_window_sizes(self, config):
+        """
+        Compute per-layer window sizes for sliding window attention.
+        Returns list of (left, right) tuples for FA3's window_size parameter:
+        - left: how many tokens before current position to attend to (-1 = unlimited)
+        - right: how many tokens after current position to attend to (0 for causal)
+        Pattern string is tiled across layers. Final layer always gets L (full context).
+        Characters: L=long (full context), S=short (half context)
+        """
+        pattern = config.window_pattern.upper()
+        assert all(c in "SL" for c in pattern), f"Invalid window_pattern: {pattern}. Use only S and L."
+        # Map characters to window sizes
+        long_window = config.sequence_len
+        short_window = -(-long_window // 3 // 128) * 128  # ceil to FA3 tile size (2048 -> 768)
+        char_to_window = {
+            "L": (long_window, 0),
+            "S": (short_window, 0),
+        }
+        # Tile pattern across layers
+        window_sizes = []
+        for layer_idx in range(config.n_layer):
+            char = pattern[layer_idx % len(pattern)]
+            window_sizes.append(char_to_window[char])
+        # Final layer always gets full context
+        window_sizes[-1] = (long_window, 0)
+        return window_sizes
+    def get_device(self):
+        return self.transformer.wte.weight.device
+    def estimate_flops(self):
+        """
+        Return the estimated FLOPs per token for the model (forward + backward).
+        Each matmul weight parameter contributes 2 FLOPs (multiply *, accumulate +) in forward, and 2X that in backward => 2+4=6.
+        Cleanest explanation of this: https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4
+        On top of that, 12 * h * q * effective_seq_len accounts for key @ query matmul flops inside attention.
+        With sliding windows, effective_seq_len varies per layer (capped by window size).
+        Ref: https://arxiv.org/abs/2204.02311 (PaLM paper).
+        This is ~1% off from the exact formulas of Chinchilla paper, the difference is:
+        - Chinchilla counts the embedding layer as flops (? weird, it's just a lookup => we ignore)
+        - Chinchilla counts exp/sum/divide in attention softmax as flops (a little sus and very tiny => we ignore)
+        """
+        nparams = sum(p.numel() for p in self.parameters())
+        # Exclude non-matmul params: embeddings and per-layer scalars
+        value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds.values())
+        nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel +
+                          self.resid_lambdas.numel() + self.x0_lambdas.numel())
+        h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len
+        # Sum attention FLOPs per layer, accounting for sliding window
+        attn_flops = 0
+        for window_size in self.window_sizes:
+            window = window_size[0]  # (left, right) tuple, we use left
+            effective_seq = t if window < 0 else min(window, t)
+            attn_flops += 12 * h * q * effective_seq
+        num_flops_per_token = 6 * (nparams - nparams_exclude) + attn_flops
+        return num_flops_per_token
+    def num_scaling_params(self):
+        """
+        Return detailed parameter counts for scaling law analysis.
+        Different papers use different conventions:
+        - Kaplan et al. excluded embedding parameters
+        - Chinchilla included all parameters
+        Ref: https://arxiv.org/abs/2203.15556 (Chinchilla paper)
+        Ref: https://arxiv.org/abs/2001.08361 (Kaplan et al. original scaling laws paper)
+        Returns a dict with counts for each parameter group, so downstream analysis
+        can experiment with which combination gives the cleanest scaling laws.
+        """
+        # Count each group separately (mirrors the grouping in setup_optimizers)
+        wte = sum(p.numel() for p in self.transformer.wte.parameters())
+        value_embeds = sum(p.numel() for p in self.value_embeds.parameters())
+        lm_head = sum(p.numel() for p in self.lm_head.parameters())
+        transformer_matrices = sum(p.numel() for p in self.transformer.h.parameters())
+        scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel()
+        total = wte + value_embeds + lm_head + transformer_matrices + scalars
+        assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
+        return {
+            'wte': wte,
+            'value_embeds': value_embeds,
+            'lm_head': lm_head,
+            'transformer_matrices': transformer_matrices,
+            'scalars': scalars,
+            'total': total,
+        }
+    def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, scalar_lr=0.5):
+        model_dim = self.config.n_embd
+        ddp, rank, local_rank, world_size = get_dist_info()
+        # Separate out all parameters into groups
+        matrix_params = list(self.transformer.h.parameters())
+        value_embeds_params = list(self.value_embeds.parameters())
+        embedding_params = list(self.transformer.wte.parameters())
+        lm_head_params = list(self.lm_head.parameters())
+        resid_params = [self.resid_lambdas]
+        x0_params = [self.x0_lambdas]
+        assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params)
+        # Scale the LR for the AdamW parameters by ∝1/√dmodel (tuned for 768 dim model)
+        dmodel_lr_scale = (model_dim / 768) ** -0.5
+        print0(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}")
+        # Build param_groups with all required fields explicit
+        param_groups = [
+            # AdamW groups (embeddings, lm_head, scalars)
+            dict(kind='adamw', params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale, betas=(0.8, 0.96), eps=1e-10, weight_decay=0.01),
+            dict(kind='adamw', params=embedding_params, lr=embedding_lr * dmodel_lr_scale, betas=(0.8, 0.995), eps=1e-10, weight_decay=0.001),
+            dict(kind='adamw', params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale * 0.5, betas=(0.8, 0.995), eps=1e-10, weight_decay=0.01),
+            dict(kind='adamw', params=resid_params, lr=scalar_lr * 0.01, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.05),
+            dict(kind='adamw', params=x0_params, lr=scalar_lr, betas=(0.96, 0.95), eps=1e-10, weight_decay=0.0),  # higher beta1 for x0
+        ]
+        # Muon groups (matrix params, grouped by shape for stacking)
+        for shape in sorted({p.shape for p in matrix_params}):
+            group_params = [p for p in matrix_params if p.shape == shape]
+            param_groups.append(dict(
+                kind='muon', params=group_params, lr=matrix_lr,
+                momentum=0.95, ns_steps=5, beta2=0.9, weight_decay=weight_decay,
+            ))
+        Factory = DistMuonAdamW if ddp else MuonAdamW
+        optimizer = Factory(param_groups)
+        for group in optimizer.param_groups:
+            group["initial_lr"] = group["lr"]
+        return optimizer
+    def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'):
+        B, T = idx.size()
+        # Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim/2))
+        assert T <= self.cos.size(1), f"Sequence length grew beyond the rotary embeddings cache: {T} > {self.cos.size(1)}"
+        assert idx.device == self.cos.device, f"Rotary embeddings and idx are on different devices: {idx.device} != {self.cos.device}"
+        assert self.cos.dtype == COMPUTE_DTYPE, f"Rotary embeddings must be in {COMPUTE_DTYPE}, got {self.cos.dtype}"
+        # if kv cache exists, we need to offset the rotary embeddings to the current position in the cache
+        T0 = 0 if kv_cache is None else kv_cache.get_pos()
+        cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T] # truncate cache to current sequence length
+        # Forward the trunk of the Transformer
+        x = self.transformer.wte(idx) # embed current token
+        x = x.to(COMPUTE_DTYPE) # ensure activations are in compute dtype (no-op usually, but active for fp16 code path)
+        x = norm(x)
+        x0 = x  # save initial normalized embedding for x0 residual
+        for i, block in enumerate(self.transformer.h):
+            x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0
+            ve = self.value_embeds[str(i)](idx).to(x.dtype) if str(i) in self.value_embeds else None
+            x = block(x, ve, cos_sin, self.window_sizes[i], kv_cache)
+        x = norm(x)
+        # Forward the lm_head (compute logits)
+        softcap = 15 # smoothly cap the logits to the range [-softcap, softcap]
+        logits = self.lm_head(x) # (B, T, padded_vocab_size) <- very big tensor, large amount of memory
+        logits = logits[..., :self.config.vocab_size] # slice to remove padding
+        logits = logits.float() # switch to fp32 for logit softcap and loss computation
+        logits = softcap * torch.tanh(logits / softcap) # squash the logits
+        if targets is not None:
+            # training: given the targets, compute and return the loss
+            # TODO experiment with chunked cross-entropy?
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
+            return loss
+        else:
+            # inference: just return the logits directly
+            return logits
+    @torch.inference_mode()
+    def generate(self, tokens, max_tokens, temperature=1.0, top_k=None, seed=42):
+        """
+        Naive autoregressive streaming inference.
+        To make it super simple, let's assume:
+        - batch size is 1
+        - ids and the yielded tokens are simple Python lists and ints
+        """
+        assert isinstance(tokens, list)
+        device = self.get_device()
+        rng = None
+        if temperature > 0:
+            rng = torch.Generator(device=device)
+            rng.manual_seed(seed)
+        ids = torch.tensor([tokens], dtype=torch.long, device=device) # add batch dim
+        for _ in range(max_tokens):
+            logits = self.forward(ids) # (B, T, vocab_size)
+            logits = logits[:, -1, :] # (B, vocab_size)
+            if top_k is not None and top_k > 0:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            if temperature > 0:
+                logits = logits / temperature
+                probs = F.softmax(logits, dim=-1)
+                next_ids = torch.multinomial(probs, num_samples=1, generator=rng)
+            else:
+                next_ids = torch.argmax(logits, dim=-1, keepdim=True)
+            ids = torch.cat((ids, next_ids), dim=1)
+            token = next_ids.item()
+            yield token

nanochat/logo.svg ADDED Viewed

nanochat/optim.py ADDED Viewed

	@@ -0,0 +1,533 @@

+"""
+A nice and efficient mixed AdamW/Muon Combined Optimizer.
+Usually the embeddings and scalars go into AdamW, and the matrix parameters go into Muon.
+Two versions are provided (MuonAdamW, DistMuonAdamW), for single GPU and distributed.
+Adapted from: https://github.com/KellerJordan/modded-nanogpt
+Further contributions from @karpathy and @chrisjmccormick.
+"""
+import torch
+import torch.distributed as dist
+from torch import Tensor
+# -----------------------------------------------------------------------------
+"""
+Good old AdamW optimizer, fused kernel.
+https://arxiv.org/abs/1711.05101
+"""
+@torch.compile(dynamic=False, fullgraph=True)
+def adamw_step_fused(
+    p: Tensor,              # e.g. (32768, 768) - parameter tensor
+    grad: Tensor,           # e.g. (32768, 768) - gradient, same shape as p
+    exp_avg: Tensor,        # e.g. (32768, 768) - first moment, same shape as p
+    exp_avg_sq: Tensor,     # e.g. (32768, 768) - second moment, same shape as p
+    step_t: Tensor,         # () - 0-D CPU tensor, step count
+    lr_t: Tensor,           # () - 0-D CPU tensor, learning rate
+    beta1_t: Tensor,        # () - 0-D CPU tensor, beta1
+    beta2_t: Tensor,        # () - 0-D CPU tensor, beta2
+    eps_t: Tensor,          # () - 0-D CPU tensor, epsilon
+    wd_t: Tensor,           # () - 0-D CPU tensor, weight decay
+) -> None:
+    """
+    Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update
+    All in one compiled graph to eliminate Python overhead between ops.
+    The 0-D CPU tensors avoid recompilation when hyperparameter values change.
+    Mutates p, exp_avg and exp_avg_sq in place and returns nothing.
+    The optimizers in this file increment the step count before filling step_t, so the
+    first call sees step_t == 1 and both bias-correction terms are non-zero.
+    """
+    # Weight decay (decoupled, applied before the update)
+    p.mul_(1 - lr_t * wd_t)
+    # Update running averages (lerp_ is cleaner and fuses well)
+    # x.lerp_(y, w) computes x + w * (y - x), i.e. beta * x + (1 - beta) * y here
+    exp_avg.lerp_(grad, 1 - beta1_t)
+    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
+    # Bias corrections
+    bias1 = 1 - beta1_t ** step_t
+    bias2 = 1 - beta2_t ** step_t
+    # Compute update and apply
+    # eps is added after the square root of the bias-corrected second moment
+    denom = (exp_avg_sq / bias2).sqrt() + eps_t
+    step_size = lr_t / bias1
+    p.add_(exp_avg / denom, alpha=-step_size)
+# -----------------------------------------------------------------------------
+"""
+Muon optimizer adapted and simplified from modded-nanogpt.
+https://github.com/KellerJordan/modded-nanogpt
+Background:
+Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+zero even beyond the point where the iteration no longer converges all the way to one everywhere
+on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+performance at all relative to UV^T, where USV^T = G is the SVD.
+Here, an alternative to Newton-Schulz iteration with potentially better convergence properties:
+Polar Express Sign Method for orthogonalization.
+https://arxiv.org/pdf/2505.16932
+by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
+NorMuon variance reduction: per-neuron/column adaptive learning rate that normalizes
+update scales after orthogonalization (Muon's output has non-uniform scales across neurons).
+https://arxiv.org/pdf/2510.05491
+Some of the changes in nanochat implementation:
+- Uses a simpler, more general approach to parameter grouping and stacking
+- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step
+- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format)
+"""
+# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2)
+# From https://arxiv.org/pdf/2505.16932
+# Each entry is the (a, b, c) triple for one iteration. muon_step_fused consumes the first
+# ns_steps entries in order, so at most len(polar_express_coeffs) iterations are available.
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
+]
+@torch.compile(dynamic=False, fullgraph=True)
+def muon_step_fused(
+    stacked_grads: Tensor,          # e.g. (12, 768, 3072) - stacked gradients
+    stacked_params: Tensor,         # e.g. (12, 768, 3072) - stacked parameters
+    momentum_buffer: Tensor,        # e.g. (12, 768, 3072) - first moment buffer
+    second_momentum_buffer: Tensor, # e.g. (12, 768, 1) or (12, 1, 3072) - factored second moment
+    momentum_t: Tensor,             # () - 0-D CPU tensor, momentum coefficient
+    lr_t: Tensor,                   # () - 0-D CPU tensor, learning rate
+    wd_t: Tensor,                   # () - 0-D CPU tensor, weight decay
+    beta2_t: Tensor,                # () - 0-D CPU tensor, beta2 for second moment
+    ns_steps: int,                  # 5 - number of Newton-Schulz/Polar Express iterations
+    red_dim: int,                   # -1 or -2 - reduction dimension for variance
+) -> None:
+    """
+    Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update
+    All in one compiled graph to eliminate Python overhead between ops.
+    Some of the constants are 0-D CPU tensors to avoid recompilation when values change.
+    Mutates stacked_params, momentum_buffer and second_momentum_buffer in place, and also
+    overwrites stacked_grads (it is reused as scratch for the Nesterov-blended gradient).
+    Returns nothing.
+    """
+    # Nesterov momentum
+    momentum = momentum_t.to(stacked_grads.dtype)
+    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
+    # In-place lerp_: stacked_grads itself now holds the blended gradient
+    g = stacked_grads.lerp_(momentum_buffer, momentum)
+    # Polar express
+    X = g.bfloat16()
+    # Normalize each matrix by (slightly more than) its Frobenius norm before iterating
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.01 + 1e-6)
+    # Both branches form the Gram matrix on the smaller of the two trailing dimensions
+    if g.size(-2) > g.size(-1): # Tall matrix
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X.mT @ X
+            B = b * A + c * (A @ A)
+            X = a * X + X @ B
+    else: # Wide matrix (original math)
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X @ X.mT
+            B = b * A + c * (A @ A)
+            X = a * X + B @ X
+    g = X
+    # Variance reduction
+    beta2 = beta2_t.to(g.dtype)
+    # Mean of squares along red_dim: one value per row or per column of each matrix
+    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
+    red_dim_size = g.size(red_dim)
+    # mean * size = sum along red_dim, so v_norm is the Frobenius norm of each matrix in g
+    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
+    v_norm = v_norm_sq.sqrt()
+    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
+    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
+    # Frobenius norm the update would have after applying step_size alone
+    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
+    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
+    # Rescale so the per-matrix Frobenius norm is the same as before the adaptive scaling
+    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
+    g = g * final_scale.to(g.dtype)
+    # Cautious weight decay + parameter update
+    lr = lr_t.to(g.dtype)
+    wd = wd_t.to(g.dtype)
+    # Decay is applied only to entries where the update and the parameter have the same sign (or either is zero)
+    mask = (g * stacked_params) >= 0
+    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
+# -----------------------------------------------------------------------------
+# Single GPU version of the MuonAdamW optimizer.
+# Used mostly for reference, debugging and testing.
+class MuonAdamW(torch.optim.Optimizer):
+    """
+    Combined optimizer: Muon for 2D matrix params, AdamW for others, single GPU version.
+    AdamW - Fused AdamW optimizer step.
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    https://kellerjordan.github.io/posts/muon/
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - The Muon optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
+    Arguments:
+        param_groups: List of dicts, each containing:
+            - 'params': List of parameters
+            - 'kind': 'adamw' or 'muon'
+            - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
+            - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
+    """
+    def __init__(self, param_groups: list[dict]):
+        """Register the param groups (no defaults: every group must be fully specified) and allocate the scalar hyperparameter tensors."""
+        super().__init__(param_groups, defaults={})
+        # 0-D CPU tensors to avoid torch.compile recompilation when values change
+        # AdamW tensors
+        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        # Muon tensors
+        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+    def _step_adamw(self, group: dict) -> None:
+        """
+        AdamW update for each param in the group individually.
+        Lazy init the state, fill in all 0-D tensors, call the fused kernel.
+        Params whose grad is None are skipped (their step counter does not advance).
+        """
+        for p in group['params']:
+            if p.grad is None:
+                continue
+            grad = p.grad
+            state = self.state[p]
+            # State init
+            if not state:
+                state['step'] = 0
+                state['exp_avg'] = torch.zeros_like(p)
+                state['exp_avg_sq'] = torch.zeros_like(p)
+            exp_avg = state['exp_avg']
+            exp_avg_sq = state['exp_avg_sq']
+            # Incremented before use, so the kernel sees a 1-based step count
+            state['step'] += 1
+            # Fill 0-D tensors with current values
+            self._adamw_step_t.fill_(state['step'])
+            self._adamw_lr_t.fill_(group['lr'])
+            self._adamw_beta1_t.fill_(group['betas'][0])
+            self._adamw_beta2_t.fill_(group['betas'][1])
+            self._adamw_eps_t.fill_(group['eps'])
+            self._adamw_wd_t.fill_(group['weight_decay'])
+            # Fused update: weight_decay -> momentum -> bias_correction -> param_update
+            adamw_step_fused(
+                p, grad, exp_avg, exp_avg_sq,
+                self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
+                self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
+            )
+    def _step_muon(self, group: dict) -> None:
+        """
+        Muon update for all params in the group (stacked for efficiency).
+        Lazy init the state, fill in all 0-D tensors, call the fused kernel.
+        Empty groups are a no-op.
+        """
+        params: list[Tensor] = group['params']
+        if not params:
+            return
+        # Get or create group-level buffers (stored in first param's state for convenience)
+        p = params[0]
+        state = self.state[p]
+        num_params = len(params)
+        shape, device, dtype = p.shape, p.device, p.dtype
+        # Momentum for every individual parameter
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
+        momentum_buffer = state["momentum_buffer"]
+        # Second momentum buffer is factored, either per-row or per-column
+        if "second_momentum_buffer" not in state:
+            state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1])
+            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
+        second_momentum_buffer = state["second_momentum_buffer"]
+        # Reduce over columns when rows >= columns, otherwise over rows (matches state_shape above)
+        red_dim = -1 if shape[-2] >= shape[-1] else -2
+        # Stack grads and params (NOTE: this assumes all params have the same shape)
+        # NOTE(review): unlike _step_adamw there is no grad-is-None check here; this assumes
+        # every param in a Muon group has a gradient
+        stacked_grads = torch.stack([p.grad for p in params])
+        # torch.stack makes a copy, so the updated values are copied back to the params below
+        stacked_params = torch.stack(params)
+        # Fill all the 0-D tensors with current values
+        self._muon_momentum_t.fill_(group["momentum"])
+        # A beta2 of None is treated as 0.0
+        self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
+        # LR is scaled up by sqrt(rows / cols) for tall matrices, left unchanged otherwise
+        self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5)
+        self._muon_wd_t.fill_(group["weight_decay"])
+        # Single fused kernel: momentum -> polar_express -> variance_reduction -> update
+        muon_step_fused(
+            stacked_grads,
+            stacked_params,
+            momentum_buffer,
+            second_momentum_buffer,
+            self._muon_momentum_t,
+            self._muon_lr_t,
+            self._muon_wd_t,
+            self._muon_beta2_t,
+            group["ns_steps"],
+            red_dim,
+        )
+        # Copy back to original params
+        torch._foreach_copy_(params, list(stacked_params.unbind(0)))
+    @torch.no_grad()
+    def step(self):
+        """Apply one update to every param group, dispatching on the group's 'kind'. Raises ValueError for an unknown kind."""
+        for group in self.param_groups:
+            if group['kind'] == 'adamw':
+                self._step_adamw(group)
+            elif group['kind'] == 'muon':
+                self._step_muon(group)
+            else:
+                raise ValueError(f"Unknown optimizer kind: {group['kind']}")
+# -----------------------------------------------------------------------------
+# Distributed version of the MuonAdamW optimizer.
+# Used for training on multiple GPUs.
+class DistMuonAdamW(torch.optim.Optimizer):
+    """
+    Combined distributed optimizer: Muon for 2D matrix params, AdamW for others.
+    See MuonAdamW for the algorithmic details of each optimizer. This class adds
+    distributed communication to enable multi-GPU training without PyTorch DDP.
+    Design Goals:
+    - Overlap communication with computation (async ops)
+    - Minimize memory by sharding optimizer states across ranks (ZeRO-2 style)
+    - Batch small tensors into single comm ops where possible
+    Communication Pattern (3-phase async):
+    We use a 3-phase structure to maximize overlap between communication and compute:
+        Phase 1: Launch all async reduce ops
+            - Kick off all reduce_scatter/all_reduce operations
+            - Don't wait - let them run in background while we continue
+        Phase 2: Wait for reduces, compute updates, launch gathers
+            - For each group: wait for its reduce, compute the update, launch gather
+            - By processing groups in order, earlier gathers run while later computes happen
+        Phase 3: Wait for gathers, copy back
+            - Wait for all gathers to complete
+            - Copy updated params back to original tensors (Muon only)
+    AdamW Communication (ZeRO-2 style):
+    - Small params (<1024 elements): all_reduce gradients, update full param on each rank.
+      Optimizer state is replicated but these params are tiny (scalars, biases).
+    - Large params: reduce_scatter gradients so each rank gets 1/N of the grad, update
+      only that slice, then all_gather the updated slices. Optimizer state (exp_avg,
+      exp_avg_sq) is sharded - each rank only stores state for its slice.
+      Requires param.shape[0] divisible by world_size.
+    Muon Communication (stacked + chunked):
+    - All params in a Muon group must have the same shape (caller's responsibility).
+    - Stack all K params into a single (K, *shape) tensor for efficient comm.
+    - Divide K params across N ranks: each rank "owns" ceil(K/N) params.
+    - reduce_scatter the stacked grads so each rank gets its chunk.
+    - Each rank computes Muon update only for params it owns.
+    - all_gather the updated params back to all ranks.
+    - Optimizer state (momentum_buffer, second_momentum_buffer) is sharded by chunk.
+    - Padding: if K doesn't divide evenly, we zero-pad to (ceil(K/N) * N) for comm,
+      then ignore the padding when copying back.
+    Buffer Reuse:
+    - For Muon, we allocate stacked_grads for reduce_scatter input, then reuse the
+      same buffer as the output for all_gather (stacked_params). This saves memory
+      since we don't need both buffers simultaneously.
+    Arguments:
+        param_groups: List of dicts, each containing:
+            - 'params': List of parameters
+            - 'kind': 'adamw' or 'muon'
+            - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
+            - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
+    """
+    def __init__(self, param_groups: list[dict]):
+        super().__init__(param_groups, defaults={})
+        # 0-D CPU tensors to avoid torch.compile recompilation when values change
+        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+    def _reduce_adamw(self, group: dict, world_size: int) -> dict:
+        """Launch async reduce ops for AdamW group. Returns info dict with per-param infos."""
+        param_infos = {}
+        for p in group['params']:
+            grad = p.grad
+            if p.numel() < 1024:
+                # Small params: all_reduce (no scatter/gather needed)
+                future = dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
+                param_infos[p] = dict(future=future, grad_slice=grad, is_small=True)
+            else:
+                # Large params: reduce_scatter
+                assert grad.shape[0] % world_size == 0, f"AdamW reduce_scatter requires shape[0] ({grad.shape[0]}) divisible by world_size ({world_size})"
+                rank_size = grad.shape[0] // world_size
+                grad_slice = torch.empty_like(grad[:rank_size])
+                future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
+                param_infos[p] = dict(future=future, grad_slice=grad_slice, is_small=False)
+        return dict(param_infos=param_infos)
+    def _reduce_muon(self, group: dict, world_size: int) -> dict:
+        """Launch async reduce op for Muon group. Returns info dict."""
+        params = group['params']
+        chunk_size = (len(params) + world_size - 1) // world_size
+        padded_num_params = chunk_size * world_size
+        p = params[0]
+        shape, device, dtype = p.shape, p.device, p.dtype
+        # Stack grads and zero-pad to padded_num_params
+        grad_stack = torch.stack([p.grad for p in params])
+        stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device)
+        stacked_grads[:len(params)].copy_(grad_stack)
+        if len(params) < padded_num_params:
+            stacked_grads[len(params):].zero_()
+        # Reduce_scatter to get this rank's chunk
+        grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device)
+        future = dist.reduce_scatter_tensor(grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True).get_future()
+        return dict(future=future, grad_chunk=grad_chunk, stacked_grads=stacked_grads, chunk_size=chunk_size)
+    def _compute_adamw(self, group: dict, info: dict, gather_list: list, rank: int, world_size: int) -> None:
+        """Wait for reduce, compute AdamW updates, launch gathers for large params."""
+        param_infos = info['param_infos']
+        for p in group['params']:
+            pinfo = param_infos[p]
+            pinfo['future'].wait()
+            grad_slice = pinfo['grad_slice']
+            state = self.state[p]
+            # For small params, operate on full param; for large, operate on slice
+            if pinfo['is_small']:
+                p_slice = p
+            else:
+                rank_size = p.shape[0] // world_size
+                p_slice = p[rank * rank_size:(rank + 1) * rank_size]
+            # State init
+            if not state:
+                state['step'] = 0
+                state['exp_avg'] = torch.zeros_like(p_slice)
+                state['exp_avg_sq'] = torch.zeros_like(p_slice)
+            state['step'] += 1
+            # Fill 0-D tensors and run fused kernel
+            self._adamw_step_t.fill_(state['step'])
+            self._adamw_lr_t.fill_(group['lr'])
+            self._adamw_beta1_t.fill_(group['betas'][0])
+            self._adamw_beta2_t.fill_(group['betas'][1])
+            self._adamw_eps_t.fill_(group['eps'])
+            self._adamw_wd_t.fill_(group['weight_decay'])
+            adamw_step_fused(
+                p_slice, grad_slice, state['exp_avg'], state['exp_avg_sq'],
+                self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
+                self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
+            )
+            # Large params need all_gather
+            if not pinfo['is_small']:
+                future = dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()
+                gather_list.append(dict(future=future, params=None))
+    def _compute_muon(self, group: dict, info: dict, gather_list: list, rank: int) -> None:
+        """Wait for reduce, compute Muon updates, launch gather."""
+        info['future'].wait()
+        params = group['params']
+        chunk_size = info['chunk_size']
+        grad_chunk = info['grad_chunk']
+        p = params[0]
+        shape, device, dtype = p.shape, p.device, p.dtype
+        # How many params does this rank own?
+        start_idx = rank * chunk_size
+        num_owned = min(chunk_size, max(0, len(params) - start_idx))
+        # Get or create group-level state
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device)
+        if "second_momentum_buffer" not in state:
+            state_shape = (chunk_size, shape[-2], 1) if shape[-2] >= shape[-1] else (chunk_size, 1, shape[-1])
+            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
+        red_dim = -1 if shape[-2] >= shape[-1] else -2
+        # Build output buffer for all_gather
+        updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device)
+        if num_owned > 0:
+            owned_params = [params[start_idx + i] for i in range(num_owned)]
+            stacked_owned = torch.stack(owned_params)
+            # Fill 0-D tensors and run fused kernel
+            self._muon_momentum_t.fill_(group["momentum"])
+            self._muon_beta2_t.fill_(group["beta2"])
+            self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5)
+            self._muon_wd_t.fill_(group["weight_decay"])
+            muon_step_fused(
+                grad_chunk[:num_owned], stacked_owned,
+                state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned],
+                self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t,
+                group["ns_steps"], red_dim,
+            )
+            updated_params[:num_owned].copy_(stacked_owned)
+        if num_owned < chunk_size:
+            updated_params[num_owned:].zero_()
+        # Reuse stacked_grads buffer for all_gather output
+        stacked_params = info["stacked_grads"]
+        future = dist.all_gather_into_tensor(stacked_params, updated_params, async_op=True).get_future()
+        gather_list.append(dict(future=future, stacked_params=stacked_params, params=params))
+    def _finish_gathers(self, gather_list: list) -> None:
+        """Wait for all gathers and copy Muon params back."""
+        for info in gather_list:
+            info["future"].wait()
+            if info["params"] is not None:
+                # Muon: copy from stacked buffer back to individual params
+                torch._foreach_copy_(info["params"], list(info["stacked_params"][:len(info["params"])].unbind(0)))
+    @torch.no_grad()
+    def step(self):
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        # Phase 1: launch all async reduce ops
+        reduce_infos: list[dict] = []
+        for group in self.param_groups:
+            if group['kind'] == 'adamw':
+                reduce_infos.append(self._reduce_adamw(group, world_size))
+            elif group['kind'] == 'muon':
+                reduce_infos.append(self._reduce_muon(group, world_size))
+            else:
+                raise ValueError(f"Unknown optimizer kind: {group['kind']}")
+        # Phase 2: wait for reduces, compute updates, launch gathers
+        gather_list: list[dict] = []
+        for group, info in zip(self.param_groups, reduce_infos):
+            if group['kind'] == 'adamw':
+                self._compute_adamw(group, info, gather_list, rank, world_size)
+            elif group['kind'] == 'muon':
+                self._compute_muon(group, info, gather_list, rank)
+            else:
+                raise ValueError(f"Unknown optimizer kind: {group['kind']}")
+        # Phase 3: wait for gathers, copy back
+        self._finish_gathers(gather_list)

nanochat/tokenizer.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""
+Tokenizer module — patched for Victorian LLM HuggingFace Space.
+Delegates to tokenizer_wrapper.py which provides the VictorianTokenizer class.
+"""
+import sys
+import os
+# Ensure the app root is on the path so tokenizer_wrapper can be imported
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from tokenizer_wrapper import get_tokenizer, get_token_bytes
+__all__ = ["get_tokenizer", "get_token_bytes"]

nanochat/ui.html ADDED Viewed

	@@ -0,0 +1,566 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
+    <title>NanoChat</title>
+    <link rel="icon" type="image/svg+xml" href="/logo.svg">
+    <style>
+        :root {
+            color-scheme: light;
+        }
+        * {
+            box-sizing: border-box;
+        }
+        html, body{
+            height: 100%;
+            margin: 0;
+        }
+        body {
+            font-family: ui-sans-serif, -apple-system, system-ui, "Segoe UI", Helvetica, "Apple Color Emoji", Arial, sans-serif, "Segoe UI Emoji", "Segoe UI Symbol";
+            background-color: #ffffff;
+            color: #111827;
+            min-height: 100dvh;
+            margin: 0;
+            display: flex;
+            flex-direction: column;
+        }
+        .header {
+            background-color: #ffffff;
+            padding: 1.25rem 1.5rem;
+        }
+        .header-left {
+            display: flex;
+            align-items: center;
+            gap: 0.75rem;
+        }
+        .header-logo {
+            height: 32px;
+            width: auto;
+        }
+        .header h1 {
+            font-size: 1.25rem;
+            font-weight: 600;
+            margin: 0;
+            color: #111827;
+        }
+        .new-conversation-btn {
+            width: 32px;
+            height: 32px;
+            padding: 0;
+            border: 1px solid #e5e7eb;
+            border-radius: 0.5rem;
+            background-color: #ffffff;
+            color: #6b7280;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            transition: all 0.2s ease;
+        }
+        .new-conversation-btn:hover {
+            background-color: #f3f4f6;
+            border-color: #d1d5db;
+            color: #374151;
+        }
+        .chat-container {
+            flex: 1;
+            overflow-y: auto;
+            background-color: #ffffff;
+        }
+        .chat-wrapper {
+            max-width: 48rem;
+            margin: 0 auto;
+            padding: 2rem 1.5rem 3rem;
+            display: flex;
+            flex-direction: column;
+            gap: 0.75rem;
+        }
+        .message {
+            display: flex;
+            justify-content: flex-start;
+            margin-bottom: 0.5rem;
+            color: #0d0d0d;
+        }
+        .message.assistant {
+            justify-content: flex-start;
+        }
+        .message.user {
+            justify-content: flex-end;
+        }
+        .message-content {
+            white-space: pre-wrap;
+            line-height: 1.6;
+            max-width: 100%;
+        }
+        .message.assistant .message-content {
+            background: transparent;
+            border: none;
+            cursor: pointer;
+            border-radius: 0.5rem;
+            padding: 0.5rem;
+            margin-left: -0.5rem;
+            transition: background-color 0.2s ease;
+        }
+        .message.assistant .message-content:hover {
+            background-color: #f9fafb;
+        }
+        .message.user .message-content {
+            background-color: #f3f4f6;
+            border-radius: 1.25rem;
+            padding: 0.8rem 1rem;
+            max-width: 65%;
+            cursor: pointer;
+            transition: background-color 0.2s ease;
+        }
+        .message.user .message-content:hover {
+            background-color: #e5e7eb;
+        }
+        .message.console .message-content {
+            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', 'Consolas', 'Courier New', monospace;
+            font-size: 0.875rem;
+            background-color: #fafafa;
+            padding: 0.75rem 1rem;
+            color: #374151;
+            max-width: 80%;
+        }
+        .input-container {
+            background-color: #ffffff;
+            padding: 1rem;
+            padding-bottom: calc(1rem + env(safe-area-inset-bottom))
+        }
+        .input-wrapper {
+            max-width: 48rem;
+            margin: 0 auto;
+            display: flex;
+            gap: 0.75rem;
+            align-items: flex-end;
+        }
+        .chat-input {
+            flex: 1;
+            padding: 0.8rem 1rem;
+            border: 1px solid #d1d5db;
+            border-radius: 0.75rem;
+            background-color: #ffffff;
+            color: #111827;
+            font-size: 1rem;
+            line-height: 1.5;
+            resize: none;
+            outline: none;
+            min-height: 54px;
+            max-height: 200px;
+            transition: border-color 0.2s ease, box-shadow 0.2s ease;
+        }
+        .chat-input::placeholder {
+            color: #9ca3af;
+        }
+        .chat-input:focus {
+            border-color: #2563eb;
+            box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
+        }
+        .send-button {
+            flex-shrink: 0;
+            padding: 0;
+            width: 54px;
+            height: 54px;
+            border: 1px solid #111827;
+            border-radius: 0.75rem;
+            background-color: #111827;
+            color: #ffffff;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            cursor: pointer;
+            transition: background-color 0.2s ease, border-color 0.2s ease, color 0.2s ease;
+        }
+        .send-button:hover:not(:disabled) {
+            background-color: #2563eb;
+            border-color: #2563eb;
+        }
+        .send-button:disabled {
+            cursor: not-allowed;
+            border-color: #d1d5db;
+            background-color: #e5e7eb;
+            color: #9ca3af;
+        }
+        .typing-indicator {
+            display: inline-block;
+            color: #6b7280;
+            letter-spacing: 0.15em;
+        }
+        .typing-indicator::after {
+            content: '···';
+            animation: typing 1.4s infinite;
+        }
+        @keyframes typing {
+            0%, 60%, 100% { opacity: 0.2; }
+            30% { opacity: 1; }
+        }
+        .error-message {
+            background-color: #fee2e2;
+            border: 1px solid #fecaca;
+            color: #b91c1c;
+            padding: 0.75rem 1rem;
+            border-radius: 0.75rem;
+            margin-top: 0.5rem;
+        }
+    </style>
+</head>
+<body>
+    <div class="header">
+        <div class="header-left">
+            <button class="new-conversation-btn" onclick="newConversation()" title="New Conversation (Ctrl+Shift+N)">
+                <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                    <path d="M12 5v14"></path>
+                    <path d="M5 12h14"></path>
+                </svg>
+            </button>
+            <h1>nanochat</h1>
+        </div>
+    </div>
+    <div class="chat-container" id="chatContainer">
+        <div class="chat-wrapper" id="chatWrapper">
+            <!-- Messages will be added here -->
+        </div>
+    </div>
+    <div class="input-container">
+        <div class="input-wrapper">
+            <textarea
+                id="chatInput"
+                class="chat-input"
+                placeholder="Ask anything"
+                rows="1"
+                onkeydown="handleKeyDown(event)"
+            ></textarea>
+            <button id="sendButton" class="send-button" onclick="sendMessage()" disabled>
+                <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                    <path d="M22 2L11 13"></path>
+                    <path d="M22 2l-7 20-4-9-9-4 20-7z"></path>
+                </svg>
+            </button>
+        </div>
+    </div>
+    <script>
+        // Empty base URL: API requests go to the same origin that served this page
+        const API_URL = '';
+        const chatContainer = document.getElementById('chatContainer');
+        const chatWrapper = document.getElementById('chatWrapper');
+        const chatInput = document.getElementById('chatInput');
+        const sendButton = document.getElementById('sendButton');
+        // Conversation history sent to the server: {role, content} objects in order
+        let messages = [];
+        // True while a response is being streamed; gates sending, editing and regenerating
+        let isGenerating = false;
+        // Sampling settings sent with every request; adjustable via /temperature and /topk
+        let currentTemperature = 0.8;
+        let currentTopK = 50;
+        // Grow the textarea with its content (capped at 200px) and keep the send button in sync
+        chatInput.addEventListener('input', function() {
+            this.style.height = 'auto';
+            this.style.height = Math.min(this.scrollHeight, 200) + 'px';
+            sendButton.disabled = !this.value.trim() || isGenerating;
+        });
+        function handleKeyDown(event) {
+            if (event.key === 'Enter' && !event.shiftKey) {
+                event.preventDefault();
+                sendMessage();
+            }
+        }
+        document.addEventListener('keydown', function(event) {
+            // Ctrl+Shift+N for new conversation
+            if (event.ctrlKey && event.shiftKey && event.key === 'N') {
+                event.preventDefault();
+                if (!isGenerating) {
+                    newConversation();
+                }
+            }
+        });
+        function newConversation() {
+            messages = [];
+            chatWrapper.innerHTML = '';
+            chatInput.value = '';
+            chatInput.style.height = 'auto';
+            sendButton.disabled = false;
+            isGenerating = false;
+            chatInput.focus();
+        }
+        function addMessage(role, content, messageIndex = null) {
+            const messageDiv = document.createElement('div');
+            messageDiv.className = `message ${role}`;
+            const contentDiv = document.createElement('div');
+            contentDiv.className = 'message-content';
+            contentDiv.textContent = content;
+            // Add click handler for user messages to enable editing
+            if (role === 'user' && messageIndex !== null) {
+                contentDiv.setAttribute('data-message-index', messageIndex);
+                contentDiv.setAttribute('title', 'Click to edit and restart from here');
+                contentDiv.addEventListener('click', function() {
+                    if (!isGenerating) {
+                        editMessage(messageIndex);
+                    }
+                });
+            }
+            // Add click handler for assistant messages to enable regeneration
+            if (role === 'assistant' && messageIndex !== null) {
+                contentDiv.setAttribute('data-message-index', messageIndex);
+                contentDiv.setAttribute('title', 'Click to regenerate this response');
+                contentDiv.addEventListener('click', function() {
+                    if (!isGenerating) {
+                        regenerateMessage(messageIndex);
+                    }
+                });
+            }
+            messageDiv.appendChild(contentDiv);
+            chatWrapper.appendChild(messageDiv);
+            chatContainer.scrollTop = chatContainer.scrollHeight;
+            return contentDiv;
+        }
+        function editMessage(messageIndex) {
+            // Find the message in the messages array
+            if (messageIndex < 0 || messageIndex >= messages.length) return;
+            const messageToEdit = messages[messageIndex];
+            if (messageToEdit.role !== 'user') return;
+            // Copy message content to input
+            chatInput.value = messageToEdit.content;
+            chatInput.style.height = 'auto';
+            chatInput.style.height = Math.min(chatInput.scrollHeight, 200) + 'px';
+            // Remove this message and all subsequent messages from the array
+            messages = messages.slice(0, messageIndex);
+            // Remove message elements from DOM starting from messageIndex
+            const allMessages = chatWrapper.querySelectorAll('.message');
+            for (let i = messageIndex; i < allMessages.length; i++) {
+                allMessages[i].remove();
+            }
+            // Enable send button and focus input
+            sendButton.disabled = false;
+            chatInput.focus();
+        }
+        async function generateAssistantResponse() {
+            isGenerating = true;
+            sendButton.disabled = true;
+            const assistantContent = addMessage('assistant', '');
+            assistantContent.innerHTML = '<span class="typing-indicator"></span>';
+            try {
+                const response = await fetch(`${API_URL}/chat/completions`, {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json',
+                    },
+                    body: JSON.stringify({
+                        messages: messages,
+                        temperature: currentTemperature,
+                        top_k: currentTopK,
+                        max_tokens: 512
+                    }),
+                });
+                if (!response.ok) {
+                    throw new Error(`HTTP error! status: ${response.status}`);
+                }
+                const reader = response.body.getReader();
+                const decoder = new TextDecoder();
+                let fullResponse = '';
+                assistantContent.textContent = '';
+                while (true) {
+                    const { done, value } = await reader.read();
+                    if (done) break;
+                    const chunk = decoder.decode(value);
+                    const lines = chunk.split('\n');
+                    for (const line of lines) {
+                        if (line.startsWith('data: ')) {
+                            try {
+                                const data = JSON.parse(line.slice(6));
+                                if (data.token) {
+                                    fullResponse += data.token;
+                                    assistantContent.textContent = fullResponse;
+                                    chatContainer.scrollTop = chatContainer.scrollHeight;
+                                }
+                            } catch (e) {
+                            }
+                        }
+                    }
+                }
+                const assistantMessageIndex = messages.length;
+                messages.push({ role: 'assistant', content: fullResponse });
+                // Add click handler to regenerate this assistant message
+                assistantContent.setAttribute('data-message-index', assistantMessageIndex);
+                assistantContent.setAttribute('title', 'Click to regenerate this response');
+                assistantContent.addEventListener('click', function() {
+                    if (!isGenerating) {
+                        regenerateMessage(assistantMessageIndex);
+                    }
+                });
+            } catch (error) {
+                console.error('Error:', error);
+                assistantContent.innerHTML = `<div class="error-message">Error: ${error.message}</div>`;
+            } finally {
+                isGenerating = false;
+                sendButton.disabled = !chatInput.value.trim();
+            }
+        }
+        async function regenerateMessage(messageIndex) {
+            // Find the message in the messages array
+            if (messageIndex < 0 || messageIndex >= messages.length) return;
+            const messageToRegenerate = messages[messageIndex];
+            if (messageToRegenerate.role !== 'assistant') return;
+            // Remove this message and all subsequent messages from the array
+            messages = messages.slice(0, messageIndex);
+            // Remove message elements from DOM starting from messageIndex
+            const allMessages = chatWrapper.querySelectorAll('.message');
+            for (let i = messageIndex; i < allMessages.length; i++) {
+                allMessages[i].remove();
+            }
+            // Regenerate the assistant response
+            await generateAssistantResponse();
+        }
+        function handleSlashCommand(command) {
+            const parts = command.trim().split(/\s+/);
+            const cmd = parts[0].toLowerCase();
+            const arg = parts[1];
+            if (cmd === '/temperature') {
+                if (arg === undefined) {
+                    addMessage('console', `Current temperature: ${currentTemperature}`);
+                } else {
+                    const temp = parseFloat(arg);
+                    if (isNaN(temp) || temp < 0 || temp > 2) {
+                        addMessage('console', 'Invalid temperature. Must be between 0.0 and 2.0');
+                    } else {
+                        currentTemperature = temp;
+                        addMessage('console', `Temperature set to ${currentTemperature}`);
+                    }
+                }
+                return true;
+            } else if (cmd === '/topk') {
+                if (arg === undefined) {
+                    addMessage('console', `Current top-k: ${currentTopK}`);
+                } else {
+                    const topk = parseInt(arg);
+                    if (isNaN(topk) || topk < 1 || topk > 200) {
+                        addMessage('console', 'Invalid top-k. Must be between 1 and 200');
+                    } else {
+                        currentTopK = topk;
+                        addMessage('console', `Top-k set to ${currentTopK}`);
+                    }
+                }
+                return true;
+            } else if (cmd === '/clear') {
+                newConversation();
+                return true;
+            } else if (cmd === '/help') {
+                addMessage('console',
+                    'Available commands:\n' +
+                    '/temperature - Show current temperature\n' +
+                    '/temperature <value> - Set temperature (0.0-2.0)\n' +
+                    '/topk - Show current top-k\n' +
+                    '/topk <value> - Set top-k (1-200)\n' +
+                    '/clear - Clear conversation\n' +
+                    '/help - Show this help message'
+                );
+                return true;
+            }
+            return false;
+        }
+        async function sendMessage() {
+            const message = chatInput.value.trim();
+            if (!message || isGenerating) return;
+            // Handle slash commands
+            if (message.startsWith('/')) {
+                chatInput.value = '';
+                chatInput.style.height = 'auto';
+                handleSlashCommand(message);
+                return;
+            }
+            chatInput.value = '';
+            chatInput.style.height = 'auto';
+            const userMessageIndex = messages.length;
+            messages.push({ role: 'user', content: message });
+            addMessage('user', message, userMessageIndex);
+            await generateAssistantResponse();
+        }
+        // Enable the send button at load (the markup ships it with the disabled attribute)
+        sendButton.disabled = false;
+        // Autofocus the chat input on page load
+        chatInput.focus();
+        // Probe the server once at load: log the status on success, and on failure
+        // replace the chat area with an error banner
+        fetch(`${API_URL}/health`)
+            .then(response => response.json())
+            .then(data => {
+                console.log('Engine status:', data);
+            })
+            .catch(error => {
+                console.error('Engine not available:', error);
+                chatWrapper.innerHTML = '<div class="error-message">Engine not running. Please start engine.py first.</div>';
+            });
+    </script>
+</body>
+</html>

scripts/__init__.py ADDED Viewed

File without changes

scripts/chat_web.py ADDED Viewed

	@@ -0,0 +1,421 @@

+#!/usr/bin/env python3
+"""
+Unified web chat server - serves both UI and API from a single FastAPI instance.
+Uses data parallelism to distribute requests across multiple GPUs. Each GPU loads
+a full copy of the model, and incoming requests are distributed to available workers.
+Launch examples:
+- single available GPU (default)
+python -m scripts.chat_web
+- 4 GPUs
+python -m scripts.chat_web --num-gpus 4
+To chat, open the URL printed in the console. (If on cloud box, make sure to use public IP)
+Endpoints:
+  GET  /           - Chat UI
+  POST /chat/completions - Chat API (streaming only)
+  GET  /health     - Health check with worker pool status
+  GET  /stats      - Worker pool statistics and GPU utilization
+Abuse Prevention:
+  - Maximum 500 messages per request
+  - Maximum 8000 characters per message
+  - Maximum 32000 characters total conversation length
+  - Temperature must be within 0.0-2.0 (out-of-range values are rejected with HTTP 400, not clamped)
+  - Top-k must be within 0-200 (0 disables top-k filtering, using full vocabulary)
+  - Max tokens must be within 1-4096
+"""
+import argparse
+import json
+import os
+import torch
+import asyncio
+import logging
+import random
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
+from pydantic import BaseModel
+from typing import List, Optional, AsyncGenerator
+from dataclasses import dataclass
+from nanochat.common import compute_init, autodetect_device_type
+from nanochat.checkpoint_manager import load_model
+from nanochat.engine import Engine
+# Victorian system prompt — prepended to first user turn during inference
+SYSTEM_PREFIX = (
+    "[You are a learned Victorian gentleman in conversation. "
+    "Address the question or remark put to you directly.]\n\n"
+)
+# Abuse prevention limits
+# (checked by validate_chat_request; a violation is rejected with HTTP 400)
+MAX_MESSAGES_PER_REQUEST = 500
+MAX_MESSAGE_LENGTH = 8000
+MAX_TOTAL_CONVERSATION_LENGTH = 32000
+MIN_TEMPERATURE = 0.0
+MAX_TEMPERATURE = 2.0
+MIN_TOP_K = 0 # 0 disables top-k filtering, using full vocabulary
+MAX_TOP_K = 200
+MIN_MAX_TOKENS = 1
+MAX_MAX_TOKENS = 4096
+# Command-line options; the generation values here are the defaults used when a
+# request leaves temperature / top_k / max_tokens unset
+parser = argparse.ArgumentParser(description='NanoChat Web Server')
+parser.add_argument('-n', '--num-gpus', type=int, default=1, help='Number of GPUs to use (default: 1)')
+parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|rl")
+parser.add_argument('-t', '--temperature', type=float, default=0.7, help='Default temperature for generation')
+parser.add_argument('-k', '--top-k', type=int, default=50, help='Default top-k sampling parameter')
+parser.add_argument('-m', '--max-tokens', type=int, default=512, help='Default max tokens for generation')
+parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
+parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
+parser.add_argument('-p', '--port', type=int, default=8000, help='Port to run the server on')
+parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
+parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to bind the server to')
+# NOTE: arguments are parsed at module level, so this happens when the module is loaded
+args = parser.parse_args()
+# Configure logging for conversation traffic
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)
+# An empty --device-type means autodetect
+device_type = autodetect_device_type() if args.device_type == "" else args.device_type
+# NOTE(review): compute_init is defined in nanochat.common; what it sets up is not visible here
+ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
+@dataclass
+class Worker:
+    """A worker with a model loaded on a specific GPU."""
+    # Index of this worker within the pool (the loop index used when it was created)
+    gpu_id: int
+    # Device the worker's model was loaded onto
+    device: torch.device
+    # Engine built from the loaded model and tokenizer
+    engine: Engine
+    # Tokenizer returned by load_model alongside the model
+    tokenizer: object
+class WorkerPool:
+    """Pool of workers, each with a model replica on a different GPU."""
+    def __init__(self, num_gpus: Optional[int] = None):
+        if num_gpus is None:
+            if device_type == "cuda":
+                num_gpus = torch.cuda.device_count()
+            else:
+                num_gpus = 1 # e.g. cpu|mps
+        self.num_gpus = num_gpus
+        self.workers: List[Worker] = []
+        self.available_workers: asyncio.Queue = asyncio.Queue()
+    async def initialize(self, source: str, model_tag: Optional[str] = None, step: Optional[int] = None):
+        """Load model on each GPU."""
+        print(f"Initializing worker pool with {self.num_gpus} GPUs...")
+        if self.num_gpus > 1:
+            assert device_type == "cuda", "Only CUDA supports multiple workers/GPUs. cpu|mps does not."
+        for gpu_id in range(self.num_gpus):
+            if device_type == "cuda":
+                device = torch.device(f"cuda:{gpu_id}")
+                print(f"Loading model on GPU {gpu_id}...")
+            else:
+                device = torch.device(device_type) # e.g. cpu|mps
+                print(f"Loading model on {device_type}...")
+            model, tokenizer, _ = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
+            engine = Engine(model, tokenizer)
+            worker = Worker(
+                gpu_id=gpu_id,
+                device=device,
+                engine=engine,
+                tokenizer=tokenizer,
+            )
+            self.workers.append(worker)
+            await self.available_workers.put(worker)
+        print(f"All {self.num_gpus} workers initialized!")
+    async def acquire_worker(self) -> Worker:
+        """Get an available worker from the pool."""
+        return await self.available_workers.get()
+    async def release_worker(self, worker: Worker):
+        """Return a worker to the pool."""
+        await self.available_workers.put(worker)
+# One conversation turn as sent by the client
+class ChatMessage(BaseModel):
+    # validate_chat_request accepts only "user" or "assistant" here
+    role: str
+    # Message text; validate_chat_request rejects empty content
+    content: str
+# Request body of POST /chat/completions
+class ChatRequest(BaseModel):
+    # Conversation so far, in order
+    messages: List[ChatMessage]
+    # Optional sampling overrides; None means use the server's command-line default
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = None
+    top_k: Optional[int] = None
+def validate_chat_request(request: ChatRequest):
+    """Validate chat request to prevent abuse."""
+    # Check number of messages
+    if len(request.messages) == 0:
+        raise HTTPException(status_code=400, detail="At least one message is required")
+    if len(request.messages) > MAX_MESSAGES_PER_REQUEST:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Too many messages. Maximum {MAX_MESSAGES_PER_REQUEST} messages allowed per request"
+        )
+    # Check individual message lengths and total conversation length
+    total_length = 0
+    for i, message in enumerate(request.messages):
+        if not message.content:
+            raise HTTPException(status_code=400, detail=f"Message {i} has empty content")
+        msg_length = len(message.content)
+        if msg_length > MAX_MESSAGE_LENGTH:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Message {i} is too long. Maximum {MAX_MESSAGE_LENGTH} characters allowed per message"
+            )
+        total_length += msg_length
+    if total_length > MAX_TOTAL_CONVERSATION_LENGTH:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Total conversation is too long. Maximum {MAX_TOTAL_CONVERSATION_LENGTH} characters allowed"
+        )
+    # Validate role values
+    for i, message in enumerate(request.messages):
+        if message.role not in ["user", "assistant"]:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Message {i} has invalid role. Must be 'user', 'assistant', or 'system'"
+            )
+    # Validate temperature
+    if request.temperature is not None:
+        if not (MIN_TEMPERATURE <= request.temperature <= MAX_TEMPERATURE):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}"
+            )
+    # Validate top_k
+    if request.top_k is not None:
+        if not (MIN_TOP_K <= request.top_k <= MAX_TOP_K):
+            raise HTTPException(
+                status_code=400,
+                detail=f"top_k must be between {MIN_TOP_K} and {MAX_TOP_K}"
+            )
+    # Validate max_tokens
+    if request.max_tokens is not None:
+        if not (MIN_MAX_TOKENS <= request.max_tokens <= MAX_MAX_TOKENS):
+            raise HTTPException(
+                status_code=400,
+                detail=f"max_tokens must be between {MIN_MAX_TOKENS} and {MAX_MAX_TOKENS}"
+            )
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load models on all GPUs on startup."""
+    print("Loading nanochat models across GPUs...")
+    app.state.worker_pool = WorkerPool(num_gpus=args.num_gpus)
+    await app.state.worker_pool.initialize(args.source, model_tag=args.model_tag, step=args.step)
+    print(f"Server ready at http://localhost:{args.port}")
+    yield
+# Application instance; the lifespan hook loads the models before serving starts
+app = FastAPI(lifespan=lifespan)
+# CORS is fully open: any origin, method and header is accepted
+# NOTE(review): wildcard origins combined with allow_credentials=True is very permissive — confirm this is intended
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.get("/")
+async def root():
+    """Serve the chat UI."""
+    ui_html_path = os.path.join("nanochat", "ui.html")
+    with open(ui_html_path, "r", encoding="utf-8") as f:
+        html_content = f.read()
+    # Replace the API_URL to use the same origin
+    html_content = html_content.replace(
+        "const API_URL = `http://${window.location.hostname}:8000`;",
+        "const API_URL = '';"
+    )
+    return HTMLResponse(content=html_content)
+@app.get("/logo.svg")
+async def logo():
+    """Serve the NanoChat logo for favicon and header."""
+    logo_path = os.path.join("nanochat", "logo.svg")
+    return FileResponse(logo_path, media_type="image/svg+xml")
+async def generate_stream(
+    worker: Worker,
+    tokens,
+    temperature=None,
+    max_new_tokens=None,
+    top_k=None
+) -> AsyncGenerator[str, None]:
+    """Generate assistant response with streaming."""
+    temperature = temperature if temperature is not None else args.temperature
+    max_new_tokens = max_new_tokens if max_new_tokens is not None else args.max_tokens
+    top_k = top_k if top_k is not None else args.top_k
+    assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
+    bos = worker.tokenizer.get_bos_token_id()
+    # Accumulate tokens to properly handle multi-byte UTF-8 characters (like emojis)
+    accumulated_tokens = []
+    # Track the last complete UTF-8 string (without replacement characters)
+    last_clean_text = ""
+    for token_column, token_masks in worker.engine.generate(
+        tokens,
+        num_samples=1,
+        max_tokens=max_new_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        repetition_penalty=1.3,
+        repetition_window=64,
+        seed=random.randint(0, 2**31 - 1)
+    ):
+        token = token_column[0]
+        # Stopping criteria
+        if token == assistant_end or token == bos:
+            break
+        # Append the token to sequence
+        accumulated_tokens.append(token)
+        # Decode all accumulated tokens to get proper UTF-8 handling
+        # Note that decode is a quite efficient operation, basically table lookup and string concat
+        current_text = worker.tokenizer.decode(accumulated_tokens)
+        # Only emit text if it doesn't end with a replacement character
+        # This ensures we don't emit incomplete UTF-8 sequences
+        if not current_text.endswith('�'):
+            # Extract only the new text since last clean decode
+            new_text = current_text[len(last_clean_text):]
+            if new_text:  # Only yield if there's new content
+                yield f"data: {json.dumps({'token': new_text, 'gpu': worker.gpu_id}, ensure_ascii=False)}\n\n"
+                last_clean_text = current_text
+    yield f"data: {json.dumps({'done': True})}\n\n"
+@app.post("/chat/completions")
+async def chat_completions(request: ChatRequest):
+    """Chat completion endpoint (streaming only) - uses worker pool for multi-GPU."""
+    # Basic validation to prevent abuse
+    validate_chat_request(request)
+    # Log incoming conversation to console
+    logger.info("="*20)
+    for i, message in enumerate(request.messages):
+        logger.info(f"[{message.role.upper()}]: {message.content}")
+    logger.info("-"*20)
+    # Acquire a worker from the pool (will wait if all are busy)
+    worker_pool = app.state.worker_pool
+    worker = await worker_pool.acquire_worker()
+    try:
+        # Build conversation tokens
+        bos = worker.tokenizer.get_bos_token_id()
+        user_start = worker.tokenizer.encode_special("<|user_start|>")
+        user_end = worker.tokenizer.encode_special("<|user_end|>")
+        assistant_start = worker.tokenizer.encode_special("<|assistant_start|>")
+        assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
+        conversation_tokens = [bos]
+        turn_count = 0
+        for message in request.messages:
+            if message.role == "user":
+                content = message.content
+                # Prepend system prompt to the first user turn
+                if turn_count == 0:
+                    content = SYSTEM_PREFIX + content
+                conversation_tokens.append(user_start)
+                conversation_tokens.extend(worker.tokenizer.encode(content))
+                conversation_tokens.append(user_end)
+                turn_count += 1
+            elif message.role == "assistant":
+                conversation_tokens.append(assistant_start)
+                conversation_tokens.extend(worker.tokenizer.encode(message.content))
+                conversation_tokens.append(assistant_end)
+        conversation_tokens.append(assistant_start)
+        # Streaming response with worker release after completion
+        response_tokens = []
+        async def stream_and_release():
+            try:
+                async for chunk in generate_stream(
+                    worker,
+                    conversation_tokens,
+                    temperature=request.temperature,
+                    max_new_tokens=request.max_tokens,
+                    top_k=request.top_k
+                ):
+                    # Accumulate response for logging
+                    chunk_data = json.loads(chunk.replace("data: ", "").strip())
+                    if "token" in chunk_data:
+                        response_tokens.append(chunk_data["token"])
+                    yield chunk
+            finally:
+                # Log the assistant response to console
+                full_response = "".join(response_tokens)
+                logger.info(f"[ASSISTANT] (GPU {worker.gpu_id}): {full_response}")
+                logger.info("="*20)
+                # Release worker back to pool after streaming is done
+                await worker_pool.release_worker(worker)
+        return StreamingResponse(
+            stream_and_release(),
+            media_type="text/event-stream"
+        )
+    except Exception as e:
+        # Make sure to release worker even on error
+        await worker_pool.release_worker(worker)
+        raise e
+@app.get("/health")
+async def health():
+    """Health check endpoint."""
+    worker_pool = getattr(app.state, 'worker_pool', None)
+    return {
+        "status": "ok",
+        "ready": worker_pool is not None and len(worker_pool.workers) > 0,
+        "num_gpus": worker_pool.num_gpus if worker_pool else 0,
+        "available_workers": worker_pool.available_workers.qsize() if worker_pool else 0
+    }
+@app.get("/stats")
+async def stats():
+    """Get worker pool statistics."""
+    worker_pool = app.state.worker_pool
+    return {
+        "total_workers": len(worker_pool.workers),
+        "available_workers": worker_pool.available_workers.qsize(),
+        "busy_workers": len(worker_pool.workers) - worker_pool.available_workers.qsize(),
+        "workers": [
+            {
+                "gpu_id": w.gpu_id,
+                "device": str(w.device)
+            } for w in worker_pool.workers
+        ]
+    }
+if __name__ == "__main__":
+    import uvicorn
+    print(f"Starting NanoChat Web Server")
+    print(f"Temperature: {args.temperature}, Top-k: {args.top_k}, Max tokens: {args.max_tokens}")
+    uvicorn.run(app, host=args.host, port=args.port)

start.sh ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/bin/bash
+# Container entrypoint: fetch the model checkpoint if needed, then start the web server.
+set -e
+MODEL_DIR="/app/nanochat_cache/chatsft_checkpoints/d18"
+MODEL_REPO="tventurella/mr_chatterbox_model"
+MODEL_FILE="model_000050.pt"
+META_FILE="meta_000050.json"
+# Download when either file is missing, so a start that was interrupted after the
+# weights arrived but before the metadata did is repaired on the next launch.
+if [ ! -f "$MODEL_DIR/$MODEL_FILE" ] || [ ! -f "$MODEL_DIR/$META_FILE" ]; then
+    echo "Downloading model checkpoint..."
+    # Repo, target directory and file names are passed as arguments rather than
+    # spliced into the Python source text.
+    python -c "
+import sys
+from huggingface_hub import hf_hub_download
+repo, target_dir, *filenames = sys.argv[1:]
+for filename in filenames:
+    hf_hub_download(repo, filename, local_dir=target_dir)
+print('Model downloaded successfully.')
+" "$MODEL_REPO" "$MODEL_DIR" "$MODEL_FILE" "$META_FILE"
+else
+    echo "Model checkpoint already present."
+fi
+# Start the server (exec replaces the shell so the server receives signals directly)
+exec python -m scripts.chat_web \
+    --model-tag d18 \
+    --device-type cpu \
+    --port 7860 \
+    --temperature 0.7 \
+    --top-k 50 \
+    --max-tokens 256

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_wrapper.py ADDED Viewed

	@@ -0,0 +1,282 @@

+"""
+tokenizer_wrapper.py — nanochat-compatible wrapper for the Victorian BPE tokenizer
+nanochat's base_train.py imports:
+    from nanochat.tokenizer import get_tokenizer, get_token_bytes
+This wrapper provides a VictorianTokenizer class that satisfies nanochat's full
+interface, plus get_tokenizer() and get_token_bytes() drop-in replacements.
+Special token mapping:
+    <|endoftext|>  →  bos (document boundary, prepended to every document)
+    <|pad|>        →  pad
+    <human>        →  user_start  (replaces nanochat's <|user_start|>)
+    <victorian>    →  assistant_start  (replaces nanochat's <|assistant_start|>)
+Usage — patch nanochat/tokenizer.py by adding at the bottom:
+    from pathlib import Path
+    import sys
+    sys.path.insert(0, "/path/to/victorian")
+    from tokenizer_wrapper import get_tokenizer, get_token_bytes
+"""
+from pathlib import Path
+import torch
+from tokenizers import Tokenizer
+TOKENIZER_PATH = Path(__file__).parent / "tokenizer.json"
+class VictorianTokenizer:
+    """
+    Wraps our HuggingFace BPE tokenizer to match nanochat's expected interface.
+    """
+    def __init__(self, tokenizer_path: str | Path = TOKENIZER_PATH):
+        self._tok = Tokenizer.from_file(str(tokenizer_path))
+        self._tok.no_padding()
+        self._tok.no_truncation()
+    # ------------------------------------------------------------------
+    # Core nanochat interface (used by dataloader and base_train.py)
+    # ------------------------------------------------------------------
+    def get_vocab_size(self) -> int:
+        return self._tok.get_vocab_size()
+    def get_bos_token_id(self) -> int:
+        """Prepended to every document by nanochat's dataloader."""
+        return self._tok.token_to_id("<|endoftext|>")
+    def encode(
+        self,
+        texts: list[str] | str,
+        prepend: int | str | None = None,
+        append: int | str | None = None,
+        num_threads: int = 4,
+    ) -> list[int] | list[list[int]]:
+        """
+        Encode strings → token ID list(s).
+        Matches nanochat's native tokenizer behaviour exactly:
+          - Single string  → list[int]
+          - List of strings → list[list[int]]
+        prepend/append may be an int token ID or a special-token string
+        (e.g. prepend="<|bos|>"), matching nanochat's _encode_one interface.
+        """
+        single = isinstance(texts, str)
+        if single:
+            texts = [texts]
+        # Resolve string prepend/append to token IDs (e.g. "<|bos|>" → 0)
+        if isinstance(prepend, str):
+            prepend = self.encode_special(prepend)
+        if isinstance(append, str):
+            append = self.encode_special(append)
+        encodings = self._tok.encode_batch(texts, is_pretokenized=False)
+        ids = [enc.ids for enc in encodings]
+        if prepend is not None:
+            ids = [[prepend] + seq for seq in ids]
+        if append is not None:
+            ids = [seq + [append] for seq in ids]
+        # Single string → flat list[int] to match nanochat's native encode()
+        return ids[0] if single else ids
+    def decode(self, ids: list[int]) -> str:
+        return self._tok.decode(ids)
+    # ------------------------------------------------------------------
+    # Special token accessors
+    # ------------------------------------------------------------------
+    def encode_special(self, token: str) -> int | None:
+        """
+        Look up a special token ID by exact match.
+        Maps nanochat's native special tokens to Victorian equivalents where needed.
+        Required by nanochat's engine.py for sample generation.
+        """
+        # Try exact match first (covers our own special tokens)
+        result = self._tok.token_to_id(token)
+        if result is not None:
+            return result
+        # Map nanochat's native chat tokens to Victorian equivalents
+        _map = {
+            "<|assistant_start|>": "<victorian>",
+            "<|assistant_end|>":   "<|endoftext|>",
+            "<|user_start|>":      "<human>",
+            "<|user_end|>":        "<|endoftext|>",
+            "<|bos|>":             "<|endoftext|>",
+            "<|eos|>":             "<|endoftext|>",
+        }
+        mapped = _map.get(token)
+        if mapped:
+            return self._tok.token_to_id(mapped)
+        return None
+    def get_pad_token_id(self) -> int:
+        return self._tok.token_to_id("<|pad|>")
+    def get_user_start_id(self) -> int:
+        """Maps to nanochat's <|user_start|> role."""
+        return self._tok.token_to_id("<human>")
+    def get_assistant_start_id(self) -> int:
+        """Maps to nanochat's <|assistant_start|> role."""
+        return self._tok.token_to_id("<victorian>")
+    # ------------------------------------------------------------------
+    # Chat / fine-tuning interface (used by chat_sft.py)
+    # ------------------------------------------------------------------
+    def render_conversation(
+        self,
+        conversation: list[dict],
+        max_tokens: int = 2048,
+    ) -> tuple[list[int], list[int]]:
+        """
+        Encode a conversation into token IDs and a loss mask.
+        conversation: list of {"role": "user"|"assistant", "content": str}
+        Returns: (token_ids, loss_mask)  — loss_mask is 1 for assistant tokens, 0 otherwise.
+        Victorian mapping:
+            "user"      → <human> ...
+            "assistant" → <victorian> ... <|endoftext|>  (end token trains model to stop)
+        """
+        human_id     = self.get_user_start_id()
+        victorian_id = self.get_assistant_start_id()
+        bos_id       = self.get_bos_token_id()
+        tokens: list[int] = [bos_id]
+        mask:   list[int] = [0]
+        for turn in conversation:
+            role    = turn["role"]
+            content = turn["content"]
+            content_ids = self.encode(content)
+            if role == "user":
+                turn_tokens = [human_id] + content_ids
+                turn_mask   = [0] * len(turn_tokens)
+            else:  # assistant
+                turn_tokens = [victorian_id] + content_ids + [bos_id]
+                turn_mask   = [1] * len(turn_tokens)
+            tokens.extend(turn_tokens)
+            mask.extend(turn_mask)
+            if len(tokens) >= max_tokens:
+                tokens = tokens[:max_tokens]
+                mask   = mask[:max_tokens]
+                break
+        return tokens, mask
+    # ------------------------------------------------------------------
+    def __call__(self, texts, **kwargs):
+        """Allow tokenizer(texts, ...) as an alias for encode() — required by nanochat's core_eval."""
+        return self.encode(texts, **kwargs)
+    @property
+    def vocab_size(self) -> int:
+        return self.get_vocab_size()
+    def __repr__(self) -> str:
+        return (
+            f"VictorianTokenizer(vocab_size={self.vocab_size}, "
+            f"bos={self.get_bos_token_id()}, "
+            f"human={self.get_user_start_id()}, "
+            f"victorian={self.get_assistant_start_id()})"
+        )
+# ---------------------------------------------------------------------------
+# nanochat drop-in functions
+# ---------------------------------------------------------------------------
+_tokenizer_singleton: VictorianTokenizer | None = None
+def get_tokenizer(tokenizer_path: str | Path = TOKENIZER_PATH) -> VictorianTokenizer:
+    """Drop-in replacement for nanochat's get_tokenizer()."""
+    global _tokenizer_singleton
+    if _tokenizer_singleton is None:
+        _tokenizer_singleton = VictorianTokenizer(tokenizer_path)
+    return _tokenizer_singleton
+def get_token_bytes(device: str | torch.device = "cpu") -> torch.Tensor:
+    """
+    Drop-in replacement for nanochat's get_token_bytes().
+    Returns a 1D tensor of shape [vocab_size] where each entry is the
+    UTF-8 byte length of that token. Used by base_train.py to convert
+    loss from nats/token → bits/byte (the BPB evaluation metric).
+    """
+    tok = get_tokenizer()
+    vocab = tok._tok.get_vocab()  # {token_str: id}
+    vocab_size = tok.get_vocab_size()
+    # Build id → token string mapping
+    id_to_token = {v: k for k, v in vocab.items()}
+    byte_lengths = []
+    for i in range(vocab_size):
+        token_str = id_to_token.get(i, "")
+        # ByteLevel BPE: Ġ represents a leading space (0x20).
+        # Decode the display string back to actual bytes for a correct byte count.
+        try:
+            # Replace Ġ with space, then encode to UTF-8
+            actual = token_str.replace("Ġ", " ").replace("Ċ", "\n").replace("ĉ", "\t")
+            n_bytes = len(actual.encode("utf-8"))
+        except Exception:
+            n_bytes = 1
+        byte_lengths.append(max(1, n_bytes))  # floor at 1 to avoid div-by-zero
+    return torch.tensor(byte_lengths, dtype=torch.long, device=device)
+# ---------------------------------------------------------------------------
+# Sanity check
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import sys
+    if not TOKENIZER_PATH.exists():
+        print(f"Tokenizer not found at {TOKENIZER_PATH}")
+        sys.exit(1)
+    tok = get_tokenizer()
+    print(tok)
+    print(f"  pad={tok.get_pad_token_id()}")
+    texts = [
+        "It is a truth universally acknowledged.",
+        "The phrenological examination was most illuminating, dear fellow.",
+    ]
+    ids = tok.encode(texts, prepend=tok.get_bos_token_id())
+    for text, seq in zip(texts, ids):
+        decoded = tok.decode(seq[1:])
+        ok = "✓" if decoded == text else "✗"
+        print(f"  {ok}  {len(seq):3d} tokens  {text!r}")
+    # Test render_conversation
+    conv = [
+        {"role": "user",      "content": "What is your opinion on the railways?"},
+        {"role": "assistant", "content": "The railways are a most alarming development, yet undeniably useful."},
+    ]
+    token_ids, loss_mask = tok.render_conversation(conv)
+    print(f"\n  render_conversation: {len(token_ids)} tokens, "
+          f"{sum(loss_mask)} assistant tokens in loss mask")
+    # Test get_token_bytes
+    tb = get_token_bytes()
+    print(f"\n  get_token_bytes: shape={tuple(tb.shape)}, "
+          f"mean={tb.mean():.2f} bytes/token, "
+          f"min={tb.min():.0f}, max={tb.max():.0f}")