Spaces:
Running
Running
Kyryll Kochkin commited on
Commit ·
88c0e85
1
Parent(s): 0207551
AI added tests
Browse files- tests/test_core_helpers.py +94 -0
- tests/test_live_api.py +26 -9
- tests/test_live_more_models.py +22 -7
- tests/test_main_behavior.py +80 -0
- tests/test_model_registry.py +58 -0
- tests/test_openai_compat.py +0 -6
- tests/test_router_error_paths.py +177 -0
- tests/test_settings.py +29 -0
- tests/test_streaming_contracts.py +225 -0
tests/test_core_helpers.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for prompt/token/engine helper utilities."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import types
|
| 5 |
+
|
| 6 |
+
from app.core import engine, tokens
|
| 7 |
+
from app.core.prompting import DEFAULT_SYSTEM_PROMPT, render_chat_prompt
|
| 8 |
+
from app.schemas.chat import ChatMessage
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DummyTokenizer:
|
| 12 |
+
def __init__(self) -> None:
|
| 13 |
+
self.called_with: tuple[str, bool] | None = None
|
| 14 |
+
|
| 15 |
+
def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
|
| 16 |
+
self.called_with = (text, add_special_tokens)
|
| 17 |
+
return [1, 2, 3]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DummyEncoding:
|
| 21 |
+
def __init__(self, size: int) -> None:
|
| 22 |
+
self._size = size
|
| 23 |
+
|
| 24 |
+
def encode(self, _: str) -> list[int]:
|
| 25 |
+
return list(range(self._size))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class DummyTikToken:
|
| 29 |
+
def __init__(self, size: int) -> None:
|
| 30 |
+
self._size = size
|
| 31 |
+
|
| 32 |
+
def encoding_for_model(self, _: str) -> DummyEncoding:
|
| 33 |
+
return DummyEncoding(self._size)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_render_chat_prompt_uses_default_system_prompt() -> None:
|
| 37 |
+
prompt = render_chat_prompt([ChatMessage(role="user", content="Hello")])
|
| 38 |
+
assert prompt.startswith(f"System: {DEFAULT_SYSTEM_PROMPT}\n\n")
|
| 39 |
+
assert prompt.endswith("Assistant:")
|
| 40 |
+
assert "User: Hello" in prompt
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_render_chat_prompt_overrides_system_prompt_when_present() -> None:
|
| 44 |
+
prompt = render_chat_prompt(
|
| 45 |
+
[
|
| 46 |
+
ChatMessage(role="system", content="Custom system"),
|
| 47 |
+
ChatMessage(role="user", content="Hello"),
|
| 48 |
+
ChatMessage(role="assistant", content="Hi"),
|
| 49 |
+
]
|
| 50 |
+
)
|
| 51 |
+
assert prompt.startswith("System: Custom system\n\n")
|
| 52 |
+
assert "User: Hello" in prompt
|
| 53 |
+
assert "Assistant: Hi" in prompt
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def test_count_tokens_returns_zero_for_empty_text() -> None:
|
| 57 |
+
assert tokens.count_tokens("", "GPT3-dev") == 0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_count_tokens_uses_tiktoken_when_available(monkeypatch) -> None:
|
| 61 |
+
monkeypatch.setattr(tokens, "tiktoken", DummyTikToken(size=4))
|
| 62 |
+
assert tokens.count_tokens("hello", "GPT3-dev") == 4
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_count_tokens_falls_back_to_tokenizer_encode(monkeypatch) -> None:
|
| 66 |
+
monkeypatch.setattr(tokens, "tiktoken", None)
|
| 67 |
+
tokenizer = DummyTokenizer()
|
| 68 |
+
assert tokens.count_tokens("hello", "GPT3-dev", tokenizer=tokenizer) == 3
|
| 69 |
+
assert tokenizer.called_with == ("hello", False)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def test_apply_stop_sequences_returns_earliest_stop_index() -> None:
|
| 73 |
+
text, reason = engine._apply_stop_sequences(
|
| 74 |
+
"abc<END>xyz<STOP>",
|
| 75 |
+
["<STOP>", "<END>"],
|
| 76 |
+
)
|
| 77 |
+
assert text == "abc"
|
| 78 |
+
assert reason == "stop"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_normalize_stop_handles_none_string_and_iterable() -> None:
|
| 82 |
+
assert engine._normalize_stop(None) == ()
|
| 83 |
+
assert engine._normalize_stop("stop") == ("stop",)
|
| 84 |
+
assert engine._normalize_stop(["a", "b"]) == ("a", "b")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_pad_token_id_prefers_pad_then_eos_then_zero() -> None:
|
| 88 |
+
with_pad = types.SimpleNamespace(pad_token_id=9, eos_token_id=7)
|
| 89 |
+
with_eos_only = types.SimpleNamespace(pad_token_id=None, eos_token_id=7)
|
| 90 |
+
with_none = types.SimpleNamespace(pad_token_id=None, eos_token_id=None)
|
| 91 |
+
|
| 92 |
+
assert engine._pad_token_id_or_default(with_pad) == 9
|
| 93 |
+
assert engine._pad_token_id_or_default(with_eos_only) == 7
|
| 94 |
+
assert engine._pad_token_id_or_default(with_none) == 0
|
tests/test_live_api.py
CHANGED
|
@@ -1,28 +1,45 @@
|
|
| 1 |
"""Live API smoke tests hitting a running server.
|
| 2 |
|
| 3 |
Skipped by default; set RUN_LIVE_API_TESTS=1 to enable.
|
| 4 |
-
Configure API base via API_BASE_URL (default:
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
import os
|
| 9 |
-
from typing import
|
| 10 |
|
| 11 |
import pytest
|
| 12 |
import httpx
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
RUN_LIVE = os.environ.get("RUN_LIVE_API_TESTS") == "1"
|
| 16 |
-
BASE_URL = os.environ.get("API_BASE_URL",
|
|
|
|
| 17 |
PROMPT = "he is a doctor. His main goal is"
|
| 18 |
|
| 19 |
|
| 20 |
def _get_models(timeout: float = 10.0) -> Set[str]:
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
@pytest.mark.skipif(not RUN_LIVE, reason="set RUN_LIVE_API_TESTS=1 to run live API tests")
|
|
@@ -53,7 +70,7 @@ def test_completion_basic(model: str) -> None:
|
|
| 53 |
}
|
| 54 |
# Allow generous timeout for first-run weight downloads
|
| 55 |
timeout = httpx.Timeout(connect=10.0, read=600.0, write=30.0, pool=10.0)
|
| 56 |
-
with httpx.Client(timeout=timeout) as client:
|
| 57 |
resp = client.post(f"{BASE_URL}/v1/completions", json=payload)
|
| 58 |
resp.raise_for_status()
|
| 59 |
body = resp.json()
|
|
|
|
| 1 |
"""Live API smoke tests hitting a running server.
|
| 2 |
|
| 3 |
Skipped by default; set RUN_LIVE_API_TESTS=1 to enable.
|
| 4 |
+
Configure API base via API_BASE_URL (default: https://k050506koch-gpt3-dev-api.hf.space).
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
import os
|
| 9 |
+
from typing import Set
|
| 10 |
|
| 11 |
import pytest
|
| 12 |
import httpx
|
| 13 |
|
| 14 |
|
| 15 |
+
DEFAULT_BASE_URL = "https://k050506koch-gpt3-dev-api.hf.space"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _normalize_base_url(raw_base_url: str) -> str:
|
| 19 |
+
base_url = raw_base_url.rstrip("/")
|
| 20 |
+
if base_url.endswith("/v1"):
|
| 21 |
+
base_url = base_url[:-3]
|
| 22 |
+
return base_url
|
| 23 |
+
|
| 24 |
+
|
| 25 |
RUN_LIVE = os.environ.get("RUN_LIVE_API_TESTS") == "1"
|
| 26 |
+
BASE_URL = _normalize_base_url(os.environ.get("API_BASE_URL", DEFAULT_BASE_URL))
|
| 27 |
+
VERIFY_SSL = os.environ.get("API_VERIFY_SSL", "1") != "0"
|
| 28 |
PROMPT = "he is a doctor. His main goal is"
|
| 29 |
|
| 30 |
|
| 31 |
def _get_models(timeout: float = 10.0) -> Set[str]:
|
| 32 |
+
try:
|
| 33 |
+
with httpx.Client(timeout=timeout, verify=VERIFY_SSL) as client:
|
| 34 |
+
resp = client.get(f"{BASE_URL}/v1/models")
|
| 35 |
+
resp.raise_for_status()
|
| 36 |
+
data = resp.json()
|
| 37 |
+
return {item["id"] for item in data.get("data", [])}
|
| 38 |
+
except httpx.HTTPError as exc:
|
| 39 |
+
pytest.fail(
|
| 40 |
+
f"Unable to reach live API at {BASE_URL}/v1/models: {exc}. "
|
| 41 |
+
"Set API_BASE_URL to your server root URL (with or without '/v1')."
|
| 42 |
+
)
|
| 43 |
|
| 44 |
|
| 45 |
@pytest.mark.skipif(not RUN_LIVE, reason="set RUN_LIVE_API_TESTS=1 to run live API tests")
|
|
|
|
| 70 |
}
|
| 71 |
# Allow generous timeout for first-run weight downloads
|
| 72 |
timeout = httpx.Timeout(connect=10.0, read=600.0, write=30.0, pool=10.0)
|
| 73 |
+
with httpx.Client(timeout=timeout, verify=VERIFY_SSL) as client:
|
| 74 |
resp = client.post(f"{BASE_URL}/v1/completions", json=payload)
|
| 75 |
resp.raise_for_status()
|
| 76 |
body = resp.json()
|
tests/test_live_more_models.py
CHANGED
|
@@ -15,8 +15,18 @@ import pytest
|
|
| 15 |
import httpx
|
| 16 |
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
RUN_LIVE = os.environ.get("RUN_LIVE_API_TESTS") == "1"
|
| 19 |
-
BASE_URL = os.environ.get("API_BASE_URL",
|
| 20 |
VERIFY_SSL = os.environ.get("API_VERIFY_SSL", "1") != "0"
|
| 21 |
PROMPT = "he is a doctor. His main goal is"
|
| 22 |
|
|
@@ -35,11 +45,17 @@ CANDIDATES = [
|
|
| 35 |
|
| 36 |
@lru_cache(maxsize=1)
|
| 37 |
def _get_models(timeout: float = 10.0) -> Set[str]:
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
if not models:
|
| 45 |
pytest.fail(f"/v1/models returned no data from {BASE_URL}")
|
|
@@ -95,4 +111,3 @@ def test_completion_for_models(model: str) -> None:
|
|
| 95 |
warnings.warn(message, stacklevel=1)
|
| 96 |
usage = body.get("usage") or {}
|
| 97 |
assert "total_tokens" in usage
|
| 98 |
-
|
|
|
|
| 15 |
import httpx
|
| 16 |
|
| 17 |
|
| 18 |
+
DEFAULT_BASE_URL = "https://k050506koch-gpt3-dev-api.hf.space"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _normalize_base_url(raw_base_url: str) -> str:
|
| 22 |
+
base_url = raw_base_url.rstrip("/")
|
| 23 |
+
if base_url.endswith("/v1"):
|
| 24 |
+
base_url = base_url[:-3]
|
| 25 |
+
return base_url
|
| 26 |
+
|
| 27 |
+
|
| 28 |
RUN_LIVE = os.environ.get("RUN_LIVE_API_TESTS") == "1"
|
| 29 |
+
BASE_URL = _normalize_base_url(os.environ.get("API_BASE_URL", DEFAULT_BASE_URL))
|
| 30 |
VERIFY_SSL = os.environ.get("API_VERIFY_SSL", "1") != "0"
|
| 31 |
PROMPT = "he is a doctor. His main goal is"
|
| 32 |
|
|
|
|
| 45 |
|
| 46 |
@lru_cache(maxsize=1)
|
| 47 |
def _get_models(timeout: float = 10.0) -> Set[str]:
|
| 48 |
+
try:
|
| 49 |
+
with httpx.Client(timeout=timeout, verify=VERIFY_SSL) as client:
|
| 50 |
+
resp = client.get(f"{BASE_URL}/v1/models")
|
| 51 |
+
resp.raise_for_status()
|
| 52 |
+
data = resp.json()
|
| 53 |
+
models = {item.get("id") for item in (data.get("data") or [])}
|
| 54 |
+
except httpx.HTTPError as exc:
|
| 55 |
+
pytest.fail(
|
| 56 |
+
f"Unable to reach live API at {BASE_URL}/v1/models: {exc}. "
|
| 57 |
+
"Set API_BASE_URL to your server root URL (with or without '/v1')."
|
| 58 |
+
)
|
| 59 |
|
| 60 |
if not models:
|
| 61 |
pytest.fail(f"/v1/models returned no data from {BASE_URL}")
|
|
|
|
| 111 |
warnings.warn(message, stacklevel=1)
|
| 112 |
usage = body.get("usage") or {}
|
| 113 |
assert "total_tokens" in usage
|
|
|
tests/test_main_behavior.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Behavioral tests for app.main helpers and handlers."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
from fastapi import HTTPException
|
| 8 |
+
from starlette.requests import Request
|
| 9 |
+
|
| 10 |
+
from app import main
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _dummy_request() -> Request:
|
| 14 |
+
scope = {
|
| 15 |
+
"type": "http",
|
| 16 |
+
"http_version": "1.1",
|
| 17 |
+
"method": "GET",
|
| 18 |
+
"path": "/",
|
| 19 |
+
"headers": [],
|
| 20 |
+
"query_string": b"",
|
| 21 |
+
}
|
| 22 |
+
return Request(scope)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_root_returns_ok_when_no_failures(monkeypatch) -> None:
|
| 26 |
+
monkeypatch.setattr(main, "_endpoint_status", {"failures": {}, "last_checked": None})
|
| 27 |
+
payload = asyncio.run(main.root())
|
| 28 |
+
assert payload == {"status": "ok", "message": "GPT3dev API is running"}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_root_returns_degraded_with_sorted_issues_and_last_checked(monkeypatch) -> None:
|
| 32 |
+
monkeypatch.setattr(
|
| 33 |
+
main,
|
| 34 |
+
"_endpoint_status",
|
| 35 |
+
{
|
| 36 |
+
"failures": {
|
| 37 |
+
"/v1/zeta": {"status_code": 503},
|
| 38 |
+
"/v1/alpha": {"status_code": 500, "detail": "boom"},
|
| 39 |
+
},
|
| 40 |
+
"last_checked": "2026-02-05T12:00:00+00:00",
|
| 41 |
+
},
|
| 42 |
+
)
|
| 43 |
+
payload = asyncio.run(main.root())
|
| 44 |
+
|
| 45 |
+
assert payload["status"] == "degraded"
|
| 46 |
+
assert payload["message"] == "GPT3dev API is running"
|
| 47 |
+
assert payload["issues"][0]["endpoint"] == "/v1/alpha"
|
| 48 |
+
assert payload["issues"][1]["endpoint"] == "/v1/zeta"
|
| 49 |
+
assert payload["last_checked"] == "2026-02-05T12:00:00+00:00"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_openai_exception_handler_wraps_error_payload() -> None:
|
| 53 |
+
exc = HTTPException(
|
| 54 |
+
status_code=400,
|
| 55 |
+
detail={
|
| 56 |
+
"message": "bad request",
|
| 57 |
+
"type": "invalid_request_error",
|
| 58 |
+
"param": "model",
|
| 59 |
+
"code": "bad_model",
|
| 60 |
+
},
|
| 61 |
+
)
|
| 62 |
+
response = asyncio.run(main.openai_http_exception_handler(_dummy_request(), exc))
|
| 63 |
+
|
| 64 |
+
assert response.status_code == 400
|
| 65 |
+
assert json.loads(response.body.decode("utf-8")) == {
|
| 66 |
+
"error": {
|
| 67 |
+
"message": "bad request",
|
| 68 |
+
"type": "invalid_request_error",
|
| 69 |
+
"param": "model",
|
| 70 |
+
"code": "bad_model",
|
| 71 |
+
}
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_openai_exception_handler_preserves_generic_detail() -> None:
|
| 76 |
+
exc = HTTPException(status_code=422, detail="unprocessable")
|
| 77 |
+
response = asyncio.run(main.openai_http_exception_handler(_dummy_request(), exc))
|
| 78 |
+
|
| 79 |
+
assert response.status_code == 422
|
| 80 |
+
assert json.loads(response.body.decode("utf-8")) == {"detail": "unprocessable"}
|
tests/test_model_registry.py
CHANGED
|
@@ -115,3 +115,61 @@ def test_custom_registry_can_extend_defaults(reset_registry, tmp_path: Path):
|
|
| 115 |
names = {spec.name for spec in model_registry.list_models()}
|
| 116 |
assert "Tiny" in names
|
| 117 |
assert "GPT3-dev" in names
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
names = {spec.name for spec in model_registry.list_models()}
|
| 116 |
assert "Tiny" in names
|
| 117 |
assert "GPT3-dev" in names
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def test_registry_loads_yaml_when_json_parse_fails(
|
| 121 |
+
reset_registry,
|
| 122 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 123 |
+
tmp_path: Path,
|
| 124 |
+
):
|
| 125 |
+
registry_path = tmp_path / "registry.yaml"
|
| 126 |
+
registry_path.write_text("- name: Tiny\n hf_repo: dummy/tiny\n")
|
| 127 |
+
|
| 128 |
+
def fake_safe_load(data: str) -> list[dict[str, str]]:
|
| 129 |
+
assert "name: Tiny" in data
|
| 130 |
+
return [{"name": "Tiny", "hf_repo": "dummy/tiny"}]
|
| 131 |
+
|
| 132 |
+
monkeypatch.setattr(model_registry.yaml, "safe_load", fake_safe_load)
|
| 133 |
+
|
| 134 |
+
reset_registry(registry_path=str(registry_path))
|
| 135 |
+
names = {spec.name for spec in model_registry.list_models()}
|
| 136 |
+
assert names == {"Tiny"}
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def test_registry_rejects_non_list_file_payload(reset_registry, tmp_path: Path):
|
| 140 |
+
registry_path = tmp_path / "registry.json"
|
| 141 |
+
registry_path.write_text(json.dumps({"name": "Tiny", "hf_repo": "dummy/tiny"}))
|
| 142 |
+
|
| 143 |
+
reset_registry(registry_path=str(registry_path))
|
| 144 |
+
with pytest.raises(ValueError, match="must contain a list"):
|
| 145 |
+
model_registry.list_models()
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def test_registry_rejects_non_object_entries(reset_registry, tmp_path: Path):
|
| 149 |
+
registry_path = tmp_path / "registry.json"
|
| 150 |
+
registry_path.write_text(json.dumps(["not-an-object"]))
|
| 151 |
+
|
| 152 |
+
reset_registry(registry_path=str(registry_path))
|
| 153 |
+
with pytest.raises(ValueError, match="entries must be objects"):
|
| 154 |
+
model_registry.list_models()
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def test_registry_path_missing_raises_file_not_found(reset_registry, tmp_path: Path):
|
| 158 |
+
reset_registry(registry_path=str(tmp_path / "missing.json"))
|
| 159 |
+
|
| 160 |
+
with pytest.raises(FileNotFoundError, match="MODEL_REGISTRY_PATH not found"):
|
| 161 |
+
model_registry.list_models()
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def test_file_registry_overrides_default_model_with_same_name(reset_registry, tmp_path: Path):
|
| 165 |
+
registry_path = tmp_path / "registry.json"
|
| 166 |
+
registry_path.write_text(
|
| 167 |
+
json.dumps([{"name": "GPT3-dev", "hf_repo": "custom/override"}])
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
reset_registry(registry_path=str(registry_path), include_defaults=True)
|
| 171 |
+
|
| 172 |
+
spec = model_registry.get_model_spec("GPT3-dev")
|
| 173 |
+
names = {item.name for item in model_registry.list_models()}
|
| 174 |
+
assert "GPT-2" in names
|
| 175 |
+
assert spec.hf_repo == "custom/override"
|
tests/test_openai_compat.py
CHANGED
|
@@ -302,12 +302,6 @@ def test_responses_instruct_messages(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
| 302 |
assert body["usage"]["total_tokens"] == 4
|
| 303 |
|
| 304 |
|
| 305 |
-
def test_openai_client_responses_create(monkeypatch: pytest.MonkeyPatch) -> None:
|
| 306 |
-
openai_module = pytest.importorskip("openai")
|
| 307 |
-
OpenAI = openai_module.OpenAI
|
| 308 |
-
pytest.skip("OpenAI client test moved to live API coverage.")
|
| 309 |
-
|
| 310 |
-
|
| 311 |
def test_embeddings_not_implemented() -> None:
|
| 312 |
with pytest.raises(HTTPException) as exc:
|
| 313 |
asyncio.run(embeddings.create_embeddings())
|
|
|
|
| 302 |
assert body["usage"]["total_tokens"] == 4
|
| 303 |
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def test_embeddings_not_implemented() -> None:
|
| 306 |
with pytest.raises(HTTPException) as exc:
|
| 307 |
asyncio.run(embeddings.create_embeddings())
|
tests/test_router_error_paths.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Router-level error path tests for OpenAI-compatible payloads."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
from fastapi import HTTPException
|
| 8 |
+
|
| 9 |
+
from app.core.model_registry import ModelSpec
|
| 10 |
+
from app.routers import chat, completions, embeddings, responses
|
| 11 |
+
from app.schemas.chat import ChatCompletionRequest
|
| 12 |
+
from app.schemas.completions import CompletionRequest
|
| 13 |
+
from app.schemas.responses import ResponseRequest
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _raise_key_error(_: str) -> None:
|
| 17 |
+
raise KeyError("unknown")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_completions_unknown_model_returns_404_openai_error(
|
| 21 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 22 |
+
) -> None:
|
| 23 |
+
monkeypatch.setattr("app.routers.completions.get_model_spec", _raise_key_error)
|
| 24 |
+
payload = CompletionRequest.model_validate({"model": "missing", "prompt": "Hi"})
|
| 25 |
+
|
| 26 |
+
with pytest.raises(HTTPException) as exc:
|
| 27 |
+
asyncio.run(completions.create_completion(payload))
|
| 28 |
+
|
| 29 |
+
assert exc.value.status_code == 404
|
| 30 |
+
assert exc.value.detail["type"] == "model_not_found"
|
| 31 |
+
assert exc.value.detail["param"] == "model"
|
| 32 |
+
assert exc.value.detail["code"] == "model_not_found"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_chat_unknown_model_returns_404_openai_error(
|
| 36 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 37 |
+
) -> None:
|
| 38 |
+
monkeypatch.setattr("app.routers.chat.get_model_spec", _raise_key_error)
|
| 39 |
+
payload = ChatCompletionRequest.model_validate(
|
| 40 |
+
{
|
| 41 |
+
"model": "missing",
|
| 42 |
+
"messages": [{"role": "user", "content": "Hi"}],
|
| 43 |
+
}
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
with pytest.raises(HTTPException) as exc:
|
| 47 |
+
asyncio.run(chat.create_chat_completion(payload))
|
| 48 |
+
|
| 49 |
+
assert exc.value.status_code == 404
|
| 50 |
+
assert exc.value.detail["type"] == "model_not_found"
|
| 51 |
+
assert exc.value.detail["param"] == "model"
|
| 52 |
+
assert exc.value.detail["code"] == "model_not_found"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_responses_unknown_model_returns_404_openai_error(
|
| 56 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 57 |
+
) -> None:
|
| 58 |
+
monkeypatch.setattr("app.routers.responses.get_model_spec", _raise_key_error)
|
| 59 |
+
payload = ResponseRequest.model_validate({"model": "missing", "input": "Hi"})
|
| 60 |
+
|
| 61 |
+
with pytest.raises(HTTPException) as exc:
|
| 62 |
+
asyncio.run(responses.create_response(payload))
|
| 63 |
+
|
| 64 |
+
assert exc.value.status_code == 404
|
| 65 |
+
assert exc.value.detail["type"] == "model_not_found"
|
| 66 |
+
assert exc.value.detail["param"] == "model"
|
| 67 |
+
assert exc.value.detail["code"] == "model_not_found"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_completions_generation_exception_returns_generation_error(
|
| 71 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 72 |
+
) -> None:
|
| 73 |
+
def boom(*_: object, **__: object) -> None:
|
| 74 |
+
raise RuntimeError("boom")
|
| 75 |
+
|
| 76 |
+
monkeypatch.setattr("app.routers.completions.get_model_spec", lambda _: None)
|
| 77 |
+
monkeypatch.setattr("app.routers.completions.engine.generate", boom)
|
| 78 |
+
payload = CompletionRequest.model_validate({"model": "GPT3-dev", "prompt": "Hi"})
|
| 79 |
+
|
| 80 |
+
with pytest.raises(HTTPException) as exc:
|
| 81 |
+
asyncio.run(completions.create_completion(payload))
|
| 82 |
+
|
| 83 |
+
assert exc.value.status_code == 500
|
| 84 |
+
assert exc.value.detail["type"] == "server_error"
|
| 85 |
+
assert exc.value.detail["code"] == "generation_error"
|
| 86 |
+
assert "Generation error:" in exc.value.detail["message"]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def test_chat_generation_exception_returns_generation_error(
|
| 90 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 91 |
+
) -> None:
|
| 92 |
+
def boom(*_: object, **__: object) -> None:
|
| 93 |
+
raise RuntimeError("boom")
|
| 94 |
+
|
| 95 |
+
monkeypatch.setattr(
|
| 96 |
+
"app.routers.chat.get_model_spec",
|
| 97 |
+
lambda model: ModelSpec(name=model, hf_repo="dummy/instruct", is_instruct=True),
|
| 98 |
+
)
|
| 99 |
+
monkeypatch.setattr("app.routers.chat.engine.apply_chat_template", lambda *_: "prompt")
|
| 100 |
+
monkeypatch.setattr("app.routers.chat.engine.generate", boom)
|
| 101 |
+
|
| 102 |
+
payload = ChatCompletionRequest.model_validate(
|
| 103 |
+
{
|
| 104 |
+
"model": "GPT4-dev-177M-1511-Instruct",
|
| 105 |
+
"messages": [{"role": "user", "content": "Hi"}],
|
| 106 |
+
}
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
with pytest.raises(HTTPException) as exc:
|
| 110 |
+
asyncio.run(chat.create_chat_completion(payload))
|
| 111 |
+
|
| 112 |
+
assert exc.value.status_code == 500
|
| 113 |
+
assert exc.value.detail["type"] == "server_error"
|
| 114 |
+
assert exc.value.detail["code"] == "generation_error"
|
| 115 |
+
assert "Generation error:" in exc.value.detail["message"]
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_responses_generation_exception_returns_generation_error(
|
| 119 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 120 |
+
) -> None:
|
| 121 |
+
def boom(*_: object, **__: object) -> None:
|
| 122 |
+
raise RuntimeError("boom")
|
| 123 |
+
|
| 124 |
+
monkeypatch.setattr(
|
| 125 |
+
"app.routers.responses.get_model_spec",
|
| 126 |
+
lambda model: ModelSpec(name=model, hf_repo="dummy/base", is_instruct=False),
|
| 127 |
+
)
|
| 128 |
+
monkeypatch.setattr("app.routers.responses.engine.generate", boom)
|
| 129 |
+
payload = ResponseRequest.model_validate({"model": "GPT3-dev", "input": "Hi"})
|
| 130 |
+
|
| 131 |
+
with pytest.raises(HTTPException) as exc:
|
| 132 |
+
asyncio.run(responses.create_response(payload))
|
| 133 |
+
|
| 134 |
+
assert exc.value.status_code == 500
|
| 135 |
+
assert exc.value.detail["type"] == "server_error"
|
| 136 |
+
assert exc.value.detail["code"] == "generation_error"
|
| 137 |
+
assert "Generation error:" in exc.value.detail["message"]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def test_responses_structured_input_with_non_instruct_model_returns_400(
|
| 141 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 142 |
+
) -> None:
|
| 143 |
+
monkeypatch.setattr(
|
| 144 |
+
"app.routers.responses.get_model_spec",
|
| 145 |
+
lambda model: ModelSpec(name=model, hf_repo="dummy/base", is_instruct=False),
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
payload = ResponseRequest.model_validate(
|
| 149 |
+
{
|
| 150 |
+
"model": "GPT3-dev",
|
| 151 |
+
"input": [{"role": "user", "content": "Hi"}],
|
| 152 |
+
}
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
with pytest.raises(HTTPException) as exc:
|
| 156 |
+
asyncio.run(responses.create_response(payload))
|
| 157 |
+
|
| 158 |
+
assert exc.value.status_code == 400
|
| 159 |
+
assert exc.value.detail["type"] == "invalid_request_error"
|
| 160 |
+
assert exc.value.detail["param"] == "model"
|
| 161 |
+
assert "not an instruct model" in exc.value.detail["message"]
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def test_embeddings_enabled_backend_returns_pending_code(
|
| 165 |
+
monkeypatch: pytest.MonkeyPatch,
|
| 166 |
+
) -> None:
|
| 167 |
+
class DummySettings:
|
| 168 |
+
enable_embeddings_backend = True
|
| 169 |
+
|
| 170 |
+
monkeypatch.setattr("app.routers.embeddings.get_settings", lambda: DummySettings())
|
| 171 |
+
|
| 172 |
+
with pytest.raises(HTTPException) as exc:
|
| 173 |
+
asyncio.run(embeddings.create_embeddings())
|
| 174 |
+
|
| 175 |
+
assert exc.value.status_code == 501
|
| 176 |
+
assert exc.value.detail["type"] == "not_implemented_error"
|
| 177 |
+
assert exc.value.detail["code"] == "embeddings_backend_pending"
|
tests/test_settings.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for environment-driven settings parsing validators."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
from pydantic import ValidationError
|
| 6 |
+
|
| 7 |
+
from app.core.settings import Settings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_cors_allow_origins_parses_comma_separated_values() -> None:
|
| 11 |
+
settings = Settings.model_validate(
|
| 12 |
+
{"CORS_ALLOW_ORIGINS": "https://a.example, https://b.example"}
|
| 13 |
+
)
|
| 14 |
+
assert settings.cors_allow_origins == ["https://a.example", "https://b.example"]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_cors_allow_origins_rejects_invalid_type() -> None:
|
| 18 |
+
with pytest.raises(ValidationError):
|
| 19 |
+
Settings.model_validate({"CORS_ALLOW_ORIGINS": 123})
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_model_allow_list_parses_comma_separated_values() -> None:
|
| 23 |
+
settings = Settings.model_validate({"MODEL_ALLOW_LIST": "GPT3-dev, GPT-2"})
|
| 24 |
+
assert settings.model_allow_list == ["GPT3-dev", "GPT-2"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_model_allow_list_rejects_invalid_type() -> None:
|
| 28 |
+
with pytest.raises(ValidationError):
|
| 29 |
+
Settings.model_validate({"MODEL_ALLOW_LIST": 123})
|
tests/test_streaming_contracts.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streaming contract tests for OpenAI-compatible SSE endpoints."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
import json
|
| 6 |
+
from collections import deque
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
from fastapi.responses import StreamingResponse
|
| 10 |
+
|
| 11 |
+
from app.core.model_registry import ModelSpec
|
| 12 |
+
from app.routers import chat, completions, responses
|
| 13 |
+
from app.schemas.chat import ChatCompletionRequest
|
| 14 |
+
from app.schemas.completions import CompletionRequest
|
| 15 |
+
from app.schemas.responses import ResponseRequest
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DummyStream:
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
*,
|
| 22 |
+
tokens: list[str],
|
| 23 |
+
prompt_tokens: int,
|
| 24 |
+
completion_tokens: int,
|
| 25 |
+
finish_reason: str = "stop",
|
| 26 |
+
) -> None:
|
| 27 |
+
self._tokens = tokens
|
| 28 |
+
self.prompt_tokens = prompt_tokens
|
| 29 |
+
self.completion_tokens = completion_tokens
|
| 30 |
+
self.finish_reason = finish_reason
|
| 31 |
+
|
| 32 |
+
def iter_tokens(self):
|
| 33 |
+
for token in self._tokens:
|
| 34 |
+
yield token
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
async def _read_stream_body(response: StreamingResponse) -> str:
|
| 38 |
+
chunks: list[str] = []
|
| 39 |
+
async for chunk in response.body_iterator:
|
| 40 |
+
if isinstance(chunk, bytes):
|
| 41 |
+
chunks.append(chunk.decode("utf-8"))
|
| 42 |
+
else:
|
| 43 |
+
chunks.append(chunk)
|
| 44 |
+
return "".join(chunks)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _parse_sse_data_frames(raw_body: str) -> list[str]:
|
| 48 |
+
frames = [frame.strip() for frame in raw_body.split("\n\n") if frame.strip()]
|
| 49 |
+
data_frames: list[str] = []
|
| 50 |
+
for frame in frames:
|
| 51 |
+
assert frame.startswith("data: ")
|
| 52 |
+
data_frames.append(frame[len("data: ") :])
|
| 53 |
+
return data_frames
|
| 54 |
+
|
| 55 |
+
|
def test_completions_stream_emits_sse_chunks_usage_and_done(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A streamed completion yields token chunks, a usage tail, then [DONE]."""
    monkeypatch.setattr("app.routers.completions.get_model_spec", lambda _: None)
    monkeypatch.setattr(
        "app.routers.completions.engine.create_stream",
        lambda *_, **__: DummyStream(
            tokens=["Hel", "lo"],
            prompt_tokens=3,
            completion_tokens=2,
            finish_reason="stop",
        ),
    )

    request = CompletionRequest.model_validate(
        {"model": "GPT3-dev", "prompt": "Hello", "stream": True}
    )
    result = asyncio.run(completions.create_completion(request))
    assert isinstance(result, StreamingResponse)

    frames = _parse_sse_data_frames(asyncio.run(_read_stream_body(result)))
    assert frames[-1] == "[DONE]"

    parsed = [json.loads(frame) for frame in frames[:-1]]
    assert parsed[0]["object"] == "text_completion.chunk"
    assert parsed[0]["choices"][0]["text"] == "Hel"
    assert parsed[1]["choices"][0]["text"] == "lo"
    assert parsed[2]["choices"][0]["finish_reason"] == "stop"

    # The final chunk carries aggregated usage and an empty choices list.
    usage_chunk = parsed[-1]
    assert usage_chunk["choices"] == []
    assert usage_chunk["usage"] == {
        "prompt_tokens": 3,
        "completion_tokens": 2,
        "total_tokens": 5,
    }
|
def test_chat_stream_emits_initial_role_delta_and_done(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A streamed chat completion opens with a role delta and ends with [DONE]."""
    monkeypatch.setattr(
        "app.routers.chat.get_model_spec",
        lambda model: ModelSpec(name=model, hf_repo="dummy/instruct", is_instruct=True),
    )
    monkeypatch.setattr("app.routers.chat.engine.apply_chat_template", lambda *_: "formatted")
    monkeypatch.setattr(
        "app.routers.chat.engine.create_stream",
        lambda *_, **__: DummyStream(
            tokens=["Hi", " there"],
            prompt_tokens=4,
            completion_tokens=2,
            finish_reason="stop",
        ),
    )

    request = ChatCompletionRequest.model_validate(
        {
            "model": "GPT4-dev-177M-1511-Instruct",
            "messages": [{"role": "user", "content": "hello"}],
            "stream": True,
        }
    )
    result = asyncio.run(chat.create_chat_completion(request))
    assert isinstance(result, StreamingResponse)

    frames = _parse_sse_data_frames(asyncio.run(_read_stream_body(result)))
    assert frames[-1] == "[DONE]"

    # First delta announces the assistant role, then content, then finish.
    parsed = [json.loads(frame) for frame in frames[:-1]]
    assert parsed[0]["choices"][0]["delta"]["role"] == "assistant"
    assert parsed[1]["choices"][0]["delta"]["content"] == "Hi"
    assert parsed[2]["choices"][0]["delta"]["content"] == " there"
    assert parsed[3]["choices"][0]["finish_reason"] == "stop"
def test_responses_stream_emits_created_delta_completed_done(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A streamed response emits created/delta/completed events, then [DONE]."""
    monkeypatch.setattr(
        "app.routers.responses.get_model_spec",
        lambda model: ModelSpec(name=model, hf_repo="dummy/base", is_instruct=False),
    )
    monkeypatch.setattr(
        "app.routers.responses.engine.create_stream",
        lambda *_, **__: DummyStream(
            tokens=["Hi", " there"],
            prompt_tokens=5,
            completion_tokens=2,
            finish_reason="stop",
        ),
    )

    request = ResponseRequest.model_validate(
        {"model": "GPT3-dev", "input": "Say hi", "stream": True}
    )
    result = asyncio.run(responses.create_response(request))
    assert isinstance(result, StreamingResponse)

    frames = _parse_sse_data_frames(asyncio.run(_read_stream_body(result)))
    assert frames[-1] == "[DONE]"

    events = [json.loads(frame) for frame in frames[:-1]]
    assert events[0]["type"] == "response.created"
    assert events[1]["type"] == "response.output_text.delta"
    assert events[1]["delta"] == "Hi"
    assert events[2]["type"] == "response.output_text.delta"
    assert events[2]["delta"] == " there"

    # The completed event carries the full concatenated text and usage totals.
    assert events[3]["type"] == "response.completed"
    assert events[3]["response"]["output"][0]["content"][0]["text"] == "Hi there"
    assert events[3]["response"]["usage"] == {
        "input_tokens": 5,
        "output_tokens": 2,
        "total_tokens": 7,
    }
|
def test_completions_stream_usage_aggregates_prompt_and_completion_tokens(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Usage sums prompt tokens once per prompt and completion tokens per choice."""
    seen_prompts: list[str] = []
    queued = deque(
        [
            DummyStream(tokens=["a1"], prompt_tokens=10, completion_tokens=1),
            DummyStream(tokens=["a2"], prompt_tokens=999, completion_tokens=2),
            DummyStream(tokens=["b1"], prompt_tokens=20, completion_tokens=3),
            DummyStream(tokens=["b2"], prompt_tokens=888, completion_tokens=4),
        ]
    )

    def fake_create_stream(model: str, prompt: str, **_: object) -> DummyStream:
        # Record the prompt for each generated choice and hand out the next stream.
        seen_prompts.append(prompt)
        return queued.popleft()

    monkeypatch.setattr("app.routers.completions.get_model_spec", lambda _: None)
    monkeypatch.setattr("app.routers.completions.engine.create_stream", fake_create_stream)

    request = CompletionRequest.model_validate(
        {"model": "GPT3-dev", "prompt": ["alpha", "beta"], "n": 2, "stream": True}
    )
    result = asyncio.run(completions.create_completion(request))
    frames = _parse_sse_data_frames(asyncio.run(_read_stream_body(result)))
    assert frames[-1] == "[DONE]"

    usage_chunk = [json.loads(frame) for frame in frames[:-1]][-1]

    # Two prompts x n=2 choices each; prompt_tokens counted once per prompt
    # (the 999/888 values from the duplicate streams must be ignored).
    assert seen_prompts == ["alpha", "alpha", "beta", "beta"]
    assert usage_chunk["usage"] == {
        "prompt_tokens": 30,
        "completion_tokens": 10,
        "total_tokens": 40,
    }