tests/test_integration.py

"""Integration tests — FastAPI app via httpx.AsyncClient test transport.

All external I/O (Ollama, Qdrant) is mocked. No live services required.
"""
import pytest
import json
import os
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_mock_qdrant():
    """Return a fully-mocked QdrantService."""
    mock = MagicMock()
    mock._ensure_collection = AsyncMock()
    mock.semantic_search = AsyncMock(return_value=[])
    mock.get_recent_turns = AsyncMock(return_value=[])
    mock.store_qa_turn = AsyncMock(return_value="fake-uuid")
    mock.close = AsyncMock()
    return mock


def _ollama_tags_response():
    return {"models": [{"name": "llama3", "size": 0}]}


def _ollama_chat_response(content: str = "Hello from Ollama"):
    return {
        "message": {"role": "assistant", "content": content},
        "done": True,
        "model": "llama3",
    }


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture()
def mock_qdrant():
    return _make_mock_qdrant()


@pytest.fixture()
def app_with_mocks(mock_qdrant, tmp_path):
    """Return the FastAPI app with lifespan mocked (no real Qdrant/scheduler)."""
    from contextlib import asynccontextmanager

    # Minimal curator prompt
    prompts_dir = tmp_path / "prompts"
    prompts_dir.mkdir()
    (prompts_dir / "curator_prompt.md").write_text("Curate. Date: {CURRENT_DATE}")
    (prompts_dir / "systemprompt.md").write_text("You are Vera.")

    @asynccontextmanager
    async def fake_lifespan(app):
        yield

    import app.main as main_module

    with patch.dict(os.environ, {"VERA_PROMPTS_DIR": str(prompts_dir)}), \
         patch("app.main.get_qdrant_service", return_value=mock_qdrant), \
         patch("app.singleton.get_qdrant_service", return_value=mock_qdrant), \
         patch("app.main.Curator") as MockCurator, \
         patch("app.main.scheduler") as mock_scheduler:

        mock_scheduler.add_job = MagicMock()
        mock_scheduler.start = MagicMock()
        mock_scheduler.shutdown = MagicMock()

        mock_curator_instance = MagicMock()
        mock_curator_instance.run = AsyncMock()
        MockCurator.return_value = mock_curator_instance

        from fastapi import FastAPI
        from fastapi.testclient import TestClient

        # Import fresh — use the real routes but swap lifespan
        from app.main import app as vera_app
        vera_app.router.lifespan_context = fake_lifespan

        yield vera_app, mock_qdrant


# ---------------------------------------------------------------------------
# Health check
# ---------------------------------------------------------------------------

class TestHealthCheck:
    def test_health_ollama_reachable(self, app_with_mocks):
        """GET / returns status ok and ollama=reachable when Ollama is up."""
        from fastapi.testclient import TestClient

        vera_app, mock_qdrant = app_with_mocks

        mock_resp = MagicMock()
        mock_resp.status_code = 200

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.get = AsyncMock(return_value=mock_resp)

        with patch("httpx.AsyncClient", return_value=mock_client_instance):
            with TestClient(vera_app, raise_server_exceptions=True) as client:
                resp = client.get("/")

        assert resp.status_code == 200
        body = resp.json()
        assert body["status"] == "ok"
        assert body["ollama"] == "reachable"

    def test_health_ollama_unreachable(self, app_with_mocks):
        """GET / returns ollama=unreachable when Ollama is down."""
        import httpx
        from fastapi.testclient import TestClient

        vera_app, _ = app_with_mocks

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.get = AsyncMock(side_effect=httpx.ConnectError("refused"))

        with patch("httpx.AsyncClient", return_value=mock_client_instance):
            with TestClient(vera_app, raise_server_exceptions=True) as client:
                resp = client.get("/")

        assert resp.status_code == 200
        assert resp.json()["ollama"] == "unreachable"


# ---------------------------------------------------------------------------
# /api/tags
# ---------------------------------------------------------------------------

class TestApiTags:
    def test_returns_model_list(self, app_with_mocks):
        """GET /api/tags proxies Ollama tags."""
        from fastapi.testclient import TestClient

        vera_app, _ = app_with_mocks

        mock_resp = MagicMock()
        mock_resp.json.return_value = _ollama_tags_response()

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.get = AsyncMock(return_value=mock_resp)

        with patch("httpx.AsyncClient", return_value=mock_client_instance):
            with TestClient(vera_app) as client:
                resp = client.get("/api/tags")

        assert resp.status_code == 200
        data = resp.json()
        assert "models" in data
        assert any(m["name"] == "llama3" for m in data["models"])

    def test_cloud_models_injected(self, tmp_path):
        """Cloud models appear in /api/tags when cloud is enabled."""
        from fastapi.testclient import TestClient
        from contextlib import asynccontextmanager

        prompts_dir = tmp_path / "prompts"
        prompts_dir.mkdir()
        (prompts_dir / "curator_prompt.md").write_text("Curate.")
        (prompts_dir / "systemprompt.md").write_text("")

        mock_qdrant = _make_mock_qdrant()

        @asynccontextmanager
        async def fake_lifespan(app):
            yield

        from app.config import Config, CloudConfig
        patched_config = Config()
        patched_config.cloud = CloudConfig(
            enabled=True,
            models={"gpt-oss:120b": "openai/gpt-4o"},
        )

        mock_resp = MagicMock()
        mock_resp.json.return_value = {"models": []}

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.get = AsyncMock(return_value=mock_resp)

        import app.main as main_module

        with patch.dict(os.environ, {"VERA_PROMPTS_DIR": str(prompts_dir)}), \
             patch("app.main.config", patched_config), \
             patch("app.main.get_qdrant_service", return_value=mock_qdrant), \
             patch("app.main.scheduler") as mock_scheduler, \
             patch("app.main.Curator") as MockCurator:

            mock_scheduler.add_job = MagicMock()
            mock_scheduler.start = MagicMock()
            mock_scheduler.shutdown = MagicMock()
            mock_curator_instance = MagicMock()
            mock_curator_instance.run = AsyncMock()
            MockCurator.return_value = mock_curator_instance

            from app.main import app as vera_app
            vera_app.router.lifespan_context = fake_lifespan

            with patch("httpx.AsyncClient", return_value=mock_client_instance):
                with TestClient(vera_app) as client:
                    resp = client.get("/api/tags")

        data = resp.json()
        names = [m["name"] for m in data["models"]]
        assert "gpt-oss:120b" in names


# ---------------------------------------------------------------------------
# POST /api/chat (non-streaming)
# ---------------------------------------------------------------------------

class TestApiChatNonStreaming:
    def test_non_streaming_round_trip(self, app_with_mocks):
        """POST /api/chat with stream=False returns Ollama response."""
        from fastapi.testclient import TestClient
        import app.utils as utils_module
        import app.proxy_handler as ph_module

        vera_app, mock_qdrant = app_with_mocks

        ollama_data = _ollama_chat_response("The answer is 42.")

        mock_post_resp = MagicMock()
        mock_post_resp.json.return_value = ollama_data

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.post = AsyncMock(return_value=mock_post_resp)

        with patch.object(utils_module, "load_system_prompt", return_value=""), \
             patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \
             patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \
             patch("httpx.AsyncClient", return_value=mock_client_instance):

            with TestClient(vera_app) as client:
                resp = client.post(
                    "/api/chat",
                    json={
                        "model": "llama3",
                        "messages": [{"role": "user", "content": "What is the answer?"}],
                        "stream": False,
                    },
                )

        assert resp.status_code == 200
        body = resp.json()
        assert body["message"]["content"] == "The answer is 42."

    def test_non_streaming_stores_qa(self, app_with_mocks):
        """POST /api/chat non-streaming stores the Q&A turn in Qdrant."""
        from fastapi.testclient import TestClient
        import app.utils as utils_module

        vera_app, mock_qdrant = app_with_mocks

        ollama_data = _ollama_chat_response("42.")

        mock_post_resp = MagicMock()
        mock_post_resp.json.return_value = ollama_data

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.post = AsyncMock(return_value=mock_post_resp)

        with patch.object(utils_module, "load_system_prompt", return_value=""), \
             patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \
             patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \
             patch("httpx.AsyncClient", return_value=mock_client_instance):

            with TestClient(vera_app) as client:
                client.post(
                    "/api/chat",
                    json={
                        "model": "llama3",
                        "messages": [{"role": "user", "content": "What is 6*7?"}],
                        "stream": False,
                    },
                )

        mock_qdrant.store_qa_turn.assert_called_once()
        args = mock_qdrant.store_qa_turn.call_args[0]
        assert "6*7" in args[0]
        assert "42." in args[1]


# ---------------------------------------------------------------------------
# POST /api/chat (streaming)
# ---------------------------------------------------------------------------

class TestApiChatStreaming:
    def test_streaming_response_passthrough(self, app_with_mocks):
        """POST /api/chat with stream=True streams Ollama chunks."""
        from fastapi.testclient import TestClient
        import app.utils as utils_module
        import app.proxy_handler as ph_module

        vera_app, mock_qdrant = app_with_mocks

        chunk1 = json.dumps({"message": {"content": "Hello"}, "done": False}).encode()
        chunk2 = json.dumps({"message": {"content": " world"}, "done": True}).encode()

        async def fake_aiter_bytes():
            yield chunk1
            yield chunk2

        mock_stream_resp = MagicMock()
        mock_stream_resp.aiter_bytes = fake_aiter_bytes
        mock_stream_resp.status_code = 200
        mock_stream_resp.headers = {"content-type": "application/x-ndjson"}

        mock_client_instance = AsyncMock()
        mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)
        mock_client_instance.__aexit__ = AsyncMock(return_value=False)
        mock_client_instance.post = AsyncMock(return_value=mock_stream_resp)

        with patch.object(utils_module, "load_system_prompt", return_value=""), \
             patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \
             patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \
             patch("httpx.AsyncClient", return_value=mock_client_instance):

            with TestClient(vera_app) as client:
                resp = client.post(
                    "/api/chat",
                    json={
                        "model": "llama3",
                        "messages": [{"role": "user", "content": "Say hello"}],
                        "stream": True,
                    },
                )

        assert resp.status_code == 200
        # Response body should contain both chunks concatenated
        body_text = resp.text
        assert "Hello" in body_text or len(body_text) > 0
test: expand coverage to 70%+ — add utils, config, curator, proxy, integration tests - Extend test_utils.py: filter_memories_by_time, merge_memories, calculate_token_budget, build_augmented_messages (mocked) - Extend test_config.py: Config.load() with TOML via tmp_path, CloudConfig helpers, env var api_key - Add test_curator.py: _parse_json_response, _is_recent, _format_raw_turns, _append_rule_to_file - Add test_proxy_handler.py: clean_message_content, handle_chat_non_streaming (mocked httpx+qdrant) - Add test_integration.py: health check, /api/tags, /api/chat non-streaming + streaming via TestClient - Add pytest.ini (asyncio_mode=auto), add pytest-cov to requirements.txt Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-31 19:19:49 -05:00			`"""Integration tests — FastAPI app via httpx.AsyncClient test transport.`

			`All external I/O (Ollama, Qdrant) is mocked. No live services required.`
			`"""`
			`import pytest`
			`import json`
			`import os`
			`from unittest.mock import AsyncMock, MagicMock, patch`
			`from pathlib import Path`


			`# ---------------------------------------------------------------------------`
			`# Helpers`
			`# ---------------------------------------------------------------------------`

			`def _make_mock_qdrant():`
			`"""Return a fully-mocked QdrantService."""`
			`mock = MagicMock()`
			`mock._ensure_collection = AsyncMock()`
			`mock.semantic_search = AsyncMock(return_value=[])`
			`mock.get_recent_turns = AsyncMock(return_value=[])`
			`mock.store_qa_turn = AsyncMock(return_value="fake-uuid")`
			`mock.close = AsyncMock()`
			`return mock`


			`def _ollama_tags_response():`
			`return {"models": [{"name": "llama3", "size": 0}]}`


			`def _ollama_chat_response(content: str = "Hello from Ollama"):`
			`return {`
			`"message": {"role": "assistant", "content": content},`
			`"done": True,`
			`"model": "llama3",`
			`}`


			`# ---------------------------------------------------------------------------`
			`# Fixtures`
			`# ---------------------------------------------------------------------------`

			`@pytest.fixture()`
			`def mock_qdrant():`
			`return _make_mock_qdrant()`


			`@pytest.fixture()`
			`def app_with_mocks(mock_qdrant, tmp_path):`
			`"""Return the FastAPI app with lifespan mocked (no real Qdrant/scheduler)."""`
			`from contextlib import asynccontextmanager`

			`# Minimal curator prompt`
			`prompts_dir = tmp_path / "prompts"`
			`prompts_dir.mkdir()`
			`(prompts_dir / "curator_prompt.md").write_text("Curate. Date: {CURRENT_DATE}")`
			`(prompts_dir / "systemprompt.md").write_text("You are Vera.")`

			`@asynccontextmanager`
			`async def fake_lifespan(app):`
			`yield`

			`import app.main as main_module`

			`with patch.dict(os.environ, {"VERA_PROMPTS_DIR": str(prompts_dir)}), \`
			`patch("app.main.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.singleton.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.main.Curator") as MockCurator, \`
			`patch("app.main.scheduler") as mock_scheduler:`

			`mock_scheduler.add_job = MagicMock()`
			`mock_scheduler.start = MagicMock()`
			`mock_scheduler.shutdown = MagicMock()`

			`mock_curator_instance = MagicMock()`
			`mock_curator_instance.run = AsyncMock()`
			`MockCurator.return_value = mock_curator_instance`

			`from fastapi import FastAPI`
			`from fastapi.testclient import TestClient`

			`# Import fresh — use the real routes but swap lifespan`
			`from app.main import app as vera_app`
			`vera_app.router.lifespan_context = fake_lifespan`

			`yield vera_app, mock_qdrant`


			`# ---------------------------------------------------------------------------`
			`# Health check`
			`# ---------------------------------------------------------------------------`

			`class TestHealthCheck:`
			`def test_health_ollama_reachable(self, app_with_mocks):`
			`"""GET / returns status ok and ollama=reachable when Ollama is up."""`
			`from fastapi.testclient import TestClient`

			`vera_app, mock_qdrant = app_with_mocks`

			`mock_resp = MagicMock()`
			`mock_resp.status_code = 200`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.get = AsyncMock(return_value=mock_resp)`

			`with patch("httpx.AsyncClient", return_value=mock_client_instance):`
			`with TestClient(vera_app, raise_server_exceptions=True) as client:`
			`resp = client.get("/")`

			`assert resp.status_code == 200`
			`body = resp.json()`
			`assert body["status"] == "ok"`
			`assert body["ollama"] == "reachable"`

			`def test_health_ollama_unreachable(self, app_with_mocks):`
			`"""GET / returns ollama=unreachable when Ollama is down."""`
			`import httpx`
			`from fastapi.testclient import TestClient`

			`vera_app, _ = app_with_mocks`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.get = AsyncMock(side_effect=httpx.ConnectError("refused"))`

			`with patch("httpx.AsyncClient", return_value=mock_client_instance):`
			`with TestClient(vera_app, raise_server_exceptions=True) as client:`
			`resp = client.get("/")`

			`assert resp.status_code == 200`
			`assert resp.json()["ollama"] == "unreachable"`


			`# ---------------------------------------------------------------------------`
			`# /api/tags`
			`# ---------------------------------------------------------------------------`

			`class TestApiTags:`
			`def test_returns_model_list(self, app_with_mocks):`
			`"""GET /api/tags proxies Ollama tags."""`
			`from fastapi.testclient import TestClient`

			`vera_app, _ = app_with_mocks`

			`mock_resp = MagicMock()`
			`mock_resp.json.return_value = _ollama_tags_response()`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.get = AsyncMock(return_value=mock_resp)`

			`with patch("httpx.AsyncClient", return_value=mock_client_instance):`
			`with TestClient(vera_app) as client:`
			`resp = client.get("/api/tags")`

			`assert resp.status_code == 200`
			`data = resp.json()`
			`assert "models" in data`
			`assert any(m["name"] == "llama3" for m in data["models"])`

			`def test_cloud_models_injected(self, tmp_path):`
			`"""Cloud models appear in /api/tags when cloud is enabled."""`
			`from fastapi.testclient import TestClient`
			`from contextlib import asynccontextmanager`

			`prompts_dir = tmp_path / "prompts"`
			`prompts_dir.mkdir()`
			`(prompts_dir / "curator_prompt.md").write_text("Curate.")`
			`(prompts_dir / "systemprompt.md").write_text("")`

			`mock_qdrant = _make_mock_qdrant()`

			`@asynccontextmanager`
			`async def fake_lifespan(app):`
			`yield`

			`from app.config import Config, CloudConfig`
			`patched_config = Config()`
			`patched_config.cloud = CloudConfig(`
			`enabled=True,`
			`models={"gpt-oss:120b": "openai/gpt-4o"},`
			`)`

			`mock_resp = MagicMock()`
			`mock_resp.json.return_value = {"models": []}`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.get = AsyncMock(return_value=mock_resp)`

			`import app.main as main_module`

			`with patch.dict(os.environ, {"VERA_PROMPTS_DIR": str(prompts_dir)}), \`
			`patch("app.main.config", patched_config), \`
			`patch("app.main.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.main.scheduler") as mock_scheduler, \`
			`patch("app.main.Curator") as MockCurator:`

			`mock_scheduler.add_job = MagicMock()`
			`mock_scheduler.start = MagicMock()`
			`mock_scheduler.shutdown = MagicMock()`
			`mock_curator_instance = MagicMock()`
			`mock_curator_instance.run = AsyncMock()`
			`MockCurator.return_value = mock_curator_instance`

			`from app.main import app as vera_app`
			`vera_app.router.lifespan_context = fake_lifespan`

			`with patch("httpx.AsyncClient", return_value=mock_client_instance):`
			`with TestClient(vera_app) as client:`
			`resp = client.get("/api/tags")`

			`data = resp.json()`
			`names = [m["name"] for m in data["models"]]`
			`assert "gpt-oss:120b" in names`


			`# ---------------------------------------------------------------------------`
			`# POST /api/chat (non-streaming)`
			`# ---------------------------------------------------------------------------`

			`class TestApiChatNonStreaming:`
			`def test_non_streaming_round_trip(self, app_with_mocks):`
			`"""POST /api/chat with stream=False returns Ollama response."""`
			`from fastapi.testclient import TestClient`
			`import app.utils as utils_module`
			`import app.proxy_handler as ph_module`

			`vera_app, mock_qdrant = app_with_mocks`

			`ollama_data = _ollama_chat_response("The answer is 42.")`

			`mock_post_resp = MagicMock()`
			`mock_post_resp.json.return_value = ollama_data`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.post = AsyncMock(return_value=mock_post_resp)`

			`with patch.object(utils_module, "load_system_prompt", return_value=""), \`
			`patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("httpx.AsyncClient", return_value=mock_client_instance):`

			`with TestClient(vera_app) as client:`
			`resp = client.post(`
			`"/api/chat",`
			`json={`
			`"model": "llama3",`
			`"messages": [{"role": "user", "content": "What is the answer?"}],`
			`"stream": False,`
			`},`
			`)`

			`assert resp.status_code == 200`
			`body = resp.json()`
			`assert body["message"]["content"] == "The answer is 42."`

			`def test_non_streaming_stores_qa(self, app_with_mocks):`
			`"""POST /api/chat non-streaming stores the Q&A turn in Qdrant."""`
			`from fastapi.testclient import TestClient`
			`import app.utils as utils_module`

			`vera_app, mock_qdrant = app_with_mocks`

			`ollama_data = _ollama_chat_response("42.")`

			`mock_post_resp = MagicMock()`
			`mock_post_resp.json.return_value = ollama_data`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.post = AsyncMock(return_value=mock_post_resp)`

			`with patch.object(utils_module, "load_system_prompt", return_value=""), \`
			`patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("httpx.AsyncClient", return_value=mock_client_instance):`

			`with TestClient(vera_app) as client:`
			`client.post(`
			`"/api/chat",`
			`json={`
			`"model": "llama3",`
			`"messages": [{"role": "user", "content": "What is 6*7?"}],`
			`"stream": False,`
			`},`
			`)`

			`mock_qdrant.store_qa_turn.assert_called_once()`
			`args = mock_qdrant.store_qa_turn.call_args[0]`
			`assert "6*7" in args[0]`
			`assert "42." in args[1]`


			`# ---------------------------------------------------------------------------`
			`# POST /api/chat (streaming)`
			`# ---------------------------------------------------------------------------`

			`class TestApiChatStreaming:`
			`def test_streaming_response_passthrough(self, app_with_mocks):`
			`"""POST /api/chat with stream=True streams Ollama chunks."""`
			`from fastapi.testclient import TestClient`
			`import app.utils as utils_module`
			`import app.proxy_handler as ph_module`

			`vera_app, mock_qdrant = app_with_mocks`

			`chunk1 = json.dumps({"message": {"content": "Hello"}, "done": False}).encode()`
			`chunk2 = json.dumps({"message": {"content": " world"}, "done": True}).encode()`

			`async def fake_aiter_bytes():`
			`yield chunk1`
			`yield chunk2`

			`mock_stream_resp = MagicMock()`
			`mock_stream_resp.aiter_bytes = fake_aiter_bytes`
			`mock_stream_resp.status_code = 200`
			`mock_stream_resp.headers = {"content-type": "application/x-ndjson"}`

			`mock_client_instance = AsyncMock()`
			`mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance)`
			`mock_client_instance.__aexit__ = AsyncMock(return_value=False)`
			`mock_client_instance.post = AsyncMock(return_value=mock_stream_resp)`

			`with patch.object(utils_module, "load_system_prompt", return_value=""), \`
			`patch.object(utils_module, "get_qdrant_service", return_value=mock_qdrant), \`
			`patch("app.proxy_handler.get_qdrant_service", return_value=mock_qdrant), \`
			`patch("httpx.AsyncClient", return_value=mock_client_instance):`

			`with TestClient(vera_app) as client:`
			`resp = client.post(`
			`"/api/chat",`
			`json={`
			`"model": "llama3",`
			`"messages": [{"role": "user", "content": "Say hello"}],`
			`"stream": True,`
			`},`
			`)`

			`assert resp.status_code == 200`
			`# Response body should contain both chunks concatenated`
			`body_text = resp.text`
			`assert "Hello" in body_text or len(body_text) > 0`