A FastAPI + vanilla JS chat app fronting an Anthropic Claude agent for order status, returns, and policy questions. Architecture: - agent.py: system prompt, runtime reminder injection, output validation, agentic tool-use loop with prompt caching on the system prompt block - tools.py: four tools (lookup_order, check_return_eligibility, initiate_return, lookup_policy) with per-session SessionGuardState enforcing protocol ordering on the tool side - mock_data.py: orders, return policy, and FAQ entries used as the single source of truth by both the prompt and the tools - server.py: FastAPI app exposing /api/chat, /health, and the static UI - static/: vanilla HTML/CSS/JS chat UI, no build step - tests/: 30 tests covering tool-side enforcement, the privacy boundary, output validation, and the agent loop with a mocked Anthropic client - deploy/: systemd unit and nginx site config for production
263 lines
8.9 KiB
Python
"""Agent-layer tests: validate_reply (Layer 4) and run_turn end-to-end with a
mocked Anthropic client.

The Anthropic API is never called. Each test wires a fake `_client` onto the
agent module that produces canned response objects, so the tests assert how
the agent loop wires layers 3 and 4 together rather than what the model
actually generates.
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
|
|
# Provide a dummy API key so `from agent import ...` does not fail when
|
|
# pydantic-settings reads .env.
|
|
os.environ.setdefault("ANTHROPIC_API_KEY", "test-key-not-used")
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
import agent
|
|
from agent import SAFE_FALLBACK, SESSIONS, build_system_content, run_turn, validate_reply
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Mock SDK objects
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class MockTextBlock:
    """Stand-in for the SDK's text content block (`type == "text"`)."""

    # The assistant-visible reply text.
    text: str
    type: str = "text"
|
|
|
|
|
|
@dataclass
class MockToolUseBlock:
    """Stand-in for the SDK's tool_use content block (`type == "tool_use"`)."""

    # Tool-use id echoed back in the matching tool_result message.
    id: str
    # Name of the tool the scripted model "calls".
    name: str
    # Arguments the scripted model passes to the tool.
    input: dict
    type: str = "tool_use"
|
|
|
|
|
|
@dataclass
class MockResponse:
    """Stand-in for one `messages.create` response from the Anthropic SDK."""

    # Mixed list of MockTextBlock / MockToolUseBlock content blocks.
    content: list[Any]
    # "tool_use" keeps the agent loop iterating; "end_turn" ends it.
    stop_reason: str = "end_turn"
|
|
|
|
|
|
class MockClient:
    """A scripted Anthropic client.

    Hands out the next response in `script` each time `messages.create` is
    called, and records every call's kwargs for later inspection.
    """

    def __init__(self, script: "list[MockResponse]"):
        # Copy so popping responses never mutates the caller's list.
        self.script = list(script)
        # One kwargs dict per messages.create call, in call order.
        self.calls: list[dict] = []

        def _create(**kwargs):
            self.calls.append(kwargs)
            if not self.script:
                raise AssertionError("MockClient ran out of scripted responses")
            return self.script.pop(0)

        # Expose the SDK-shaped `client.messages.create(...)` surface.
        class _Messages:
            create = staticmethod(_create)

        self.messages = _Messages()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _reset_sessions_and_client(monkeypatch):
    """Give every test a clean session store and no cached SDK client."""
    SESSIONS.clear()
    # Null the cached client so each test installs its own mock.
    monkeypatch.setattr(agent, "_client", None)
    try:
        yield
    finally:
        SESSIONS.clear()
|
|
|
|
|
|
def _install_mock(monkeypatch, script: "list[MockResponse]") -> MockClient:
    """Patch agent._get_client so run_turn talks to a scripted MockClient."""
    mock = MockClient(script)
    monkeypatch.setattr(agent, "_get_client", lambda: mock)
    return mock
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# build_system_content
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_build_system_content_caches_main_prompt_block():
    """The main system prompt block carries ephemeral cache_control."""
    first, reminder = build_system_content(turn_count=0)[:2]
    assert first["cache_control"] == {"type": "ephemeral"}
    # The runtime reminder block follows and must never be cached.
    assert reminder["text"].startswith("<reminder>")
    assert "cache_control" not in reminder
|
|
|
|
|
|
def test_build_system_content_adds_long_conversation_reminder_after_threshold():
    """Past the turn threshold, exactly one extra reminder block appears."""
    below = build_system_content(turn_count=2)
    above = build_system_content(turn_count=5)
    assert len(above) == len(below) + 1
    # The appended block mentions the long-conversation condition.
    assert "long" in above[-1]["text"].lower()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# validate_reply
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_validate_reply_passes_clean_reply():
    """An order ID grounded in a tool result passes with no violations."""
    tool_results = [
        {"name": "lookup_order", "result": {"order": {"order_id": "BK-10042"}}},
    ]
    verdict = validate_reply("Your order BK-10042 was delivered.", tool_results)
    assert verdict.ok
    assert verdict.violations == []
|
|
|
|
|
|
def test_validate_reply_flags_ungrounded_order_id():
    """An order ID no tool ever returned must be flagged as ungrounded."""
    verdict = validate_reply("Your order BK-99999 is on the way.", [])
    assert not verdict.ok
    assert "ungrounded_order_id:BK-99999" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_ungrounded_date():
    """A date that appeared in no tool result must be flagged."""
    verdict = validate_reply("It will arrive on 2026-12-25.", [])
    assert not verdict.ok
    flagged = [v for v in verdict.violations if v.startswith("ungrounded_date:")]
    assert flagged
|
|
|
|
|
|
def test_validate_reply_passes_grounded_date():
    """A date taken verbatim from a tool result is allowed through."""
    tool_results = [
        {"name": "lookup_order", "result": {"order": {"delivered_date": "2026-04-01"}}},
    ]
    assert validate_reply("It was delivered on 2026-04-01.", tool_results).ok
|
|
|
|
|
|
def test_validate_reply_flags_markdown_bold():
    """Bold `**` markup must trip the markdown_leaked violation."""
    verdict = validate_reply("Here are your **details**.", [])
    assert not verdict.ok
    assert "markdown_leaked" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_markdown_bullet():
    """A `- ` bullet list must trip the markdown_leaked violation."""
    verdict = validate_reply("Items:\n- The Goldfinch\n- Sapiens", [])
    assert not verdict.ok
    assert "markdown_leaked" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_off_topic_engagement():
    """Engaging with a book recommendation must be flagged off-topic."""
    reply = "I recommend Project Hail Mary, it's a great book."
    verdict = validate_reply(reply, [])
    assert not verdict.ok
    assert "off_topic_engagement" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_allows_refusal_template_even_with_keywords():
    """The canonical refusal wording passes even though it mentions books."""
    refusal = (
        "I can help with order status, returns, and our standard policies, "
        "but I'm not able to help with book recommendations. Is there an "
        "order or a policy question I can help you with instead?"
    )
    assert validate_reply(refusal, []).ok
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_turn end-to-end with mocked client
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_run_turn_returns_simple_text_reply(monkeypatch):
    """A plain text response is returned and recorded in session history."""
    script = [
        MockResponse(content=[MockTextBlock(text="Hi! How can I help with an order today?")]),
    ]
    _install_mock(monkeypatch, script)

    reply = run_turn("session-1", "hi there")

    assert "How can I help" in reply
    state = SESSIONS["session-1"]
    assert state.turn_count == 1
    # The final history entry is the assistant's reply.
    assert state.history[-1]["role"] == "assistant"
|
|
|
|
|
|
def test_run_turn_with_tool_use_loop(monkeypatch):
    """Two-step loop: model asks for a tool, then produces a final reply."""
    tool_call = MockResponse(
        stop_reason="tool_use",
        content=[
            MockToolUseBlock(
                id="toolu_1",
                name="lookup_order",
                input={"order_id": "BK-10042"},
            )
        ],
    )
    final = MockResponse(
        content=[MockTextBlock(text="Your order BK-10042 was delivered.")],
    )
    client = _install_mock(monkeypatch, [tool_call, final])

    reply = run_turn("session-2", "Where is BK-10042?")

    assert "BK-10042" in reply
    # One API call for the tool request, one for the final text.
    assert len(client.calls) == 2
    # Transcript: user, assistant(tool_use), user(tool_result), assistant(text).
    roles = [message["role"] for message in SESSIONS["session-2"].history]
    assert roles[:4] == ["user", "assistant", "user", "assistant"]
|
|
|
|
|
|
def test_run_turn_drops_hallucinated_reply_and_returns_safe_fallback(monkeypatch):
    """A reply that mentions an order ID never seen by a tool must trigger
    SAFE_FALLBACK, and the bad reply must not be appended to history."""
    hallucinated = MockResponse(
        content=[MockTextBlock(text="Your order BK-99999 will arrive on 2026-12-25.")],
    )
    _install_mock(monkeypatch, [hallucinated])

    reply = run_turn("session-3", "where is my order")

    assert reply == SAFE_FALLBACK
    # The hallucinated assistant turn is dropped; only the user message remains.
    assert [m["role"] for m in SESSIONS["session-3"].history] == ["user"]
|
|
|
|
|
|
def test_run_turn_passes_through_refusal_template(monkeypatch):
    """A valid refusal reply is returned untouched and still counts as a turn."""
    refusal = (
        "I can help with order status, returns, and our standard policies, "
        "but I'm not able to help with book recommendations. Is there an "
        "order or a policy question I can help you with instead?"
    )
    _install_mock(monkeypatch, [
        MockResponse(content=[MockTextBlock(text=refusal)]),
    ])

    assert run_turn("session-4", "recommend a mystery novel") == refusal
    assert SESSIONS["session-4"].turn_count == 1
|
|
|
|
|
|
def test_run_turn_layer_3_blocks_initiate_return_without_eligibility(monkeypatch):
    """If the model jumps straight to initiate_return, the tool refuses with
    eligibility_not_verified, and the model can recover on the next iteration.

    Here we script a model that immediately calls initiate_return, then on the
    follow-up produces a clean text reply that quotes the error message.
    """
    premature = MockResponse(
        stop_reason="tool_use",
        content=[
            MockToolUseBlock(
                id="toolu_1",
                name="initiate_return",
                input={
                    "order_id": "BK-10042",
                    "customer_email": "sarah.chen@example.com",
                    "reason": "test",
                },
            )
        ],
    )
    recovery = MockResponse(
        content=[MockTextBlock(text="I need to check return eligibility first. Could you confirm the email on the order?")],
    )
    _install_mock(monkeypatch, [premature, recovery])

    reply = run_turn("session-5", "return BK-10042")

    lowered = reply.lower()
    assert "eligibility" in lowered or "email" in lowered
    # Verify the tool actually refused: nothing should be in returns_initiated.
    assert "BK-10042" not in SESSIONS["session-5"].guard_state.returns_initiated
|