From 30cdea2aac37bab6348573c4f47c301108369393 Mon Sep 17 00:00:00 2001
From: Cody Borders <cody@codyborders.com>
Date: Tue, 14 Apr 2026 22:17:59 -0700
Subject: [PATCH] Build Bookly customer support agent

A FastAPI + vanilla JS chat app fronting an Anthropic Claude agent for
order status, returns, and policy questions.

Architecture:
- agent.py: system prompt, runtime reminder injection, output validation,
  agentic tool-use loop with prompt caching on the system prompt block
- tools.py: four tools (lookup_order, check_return_eligibility,
  initiate_return, lookup_policy) with per-session SessionGuardState
  enforcing protocol ordering on the tool side
- mock_data.py: orders, return policy, and FAQ entries used as the single
  source of truth by both the prompt and the tools
- server.py: FastAPI app exposing /api/chat, /health, and the static UI
- static/: vanilla HTML/CSS/JS chat UI, no build step
- tests/: 30 tests covering tool-side enforcement, the privacy boundary,
  output validation, and the agent loop with a mocked Anthropic client
- deploy/: systemd unit and nginx site config for production
---
 .env.example             |   1 +
 .gitignore               |   7 +
 README.md                |  53 +++++
 agent.py                 | 411 +++++++++++++++++++++++++++++++++++++++
 config.py                |  21 ++
 deploy/bookly.nginx.conf |  18 ++
 deploy/bookly.service    |  18 ++
 mock_data.py             | 109 +++++++++++
 requirements.txt         |   7 +
 server.py                |  50 +++++
 static/chat.js           |  82 ++++++++
 static/index.html        |  30 +++
 static/style.css         | 161 +++++++++++++++
 tests/__init__.py        |   0
 tests/test_agent.py      | 262 +++++++++++++++++++++++++
 tests/test_tools.py      | 183 +++++++++++++++++
 tools.py                 | 295 ++++++++++++++++++++++++++++
 17 files changed, 1708 insertions(+)
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 agent.py
 create mode 100644 config.py
 create mode 100644 deploy/bookly.nginx.conf
 create mode 100644 deploy/bookly.service
 create mode 100644 mock_data.py
 create mode 100644 requirements.txt
 create mode 100644 server.py
 create mode 100644 static/chat.js
 create mode 100644 static/index.html
 create mode 100644 static/style.css
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_agent.py
 create mode 100644 tests/test_tools.py
 create mode 100644 tools.py

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..53c1030
--- /dev/null
+++ b/.env.example
@@ -0,0 +1 @@
+ANTHROPIC_API_KEY=sk-ant-...
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..60a4f31
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+venv/
+.env
+__pycache__/
+*.pyc
+.pytest_cache/
+.DS_Store
+DESIGN.md
diff --git a/README.md b/README.md
index a4931c1..7e37b4e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,54 @@
 # Bookly
+
+A conversational customer support agent for a fictional online bookstore.
+
+The agent handles two depth use cases (order status and returns) and one breadth use case (policy questions) over a vanilla HTML chat UI, backed by Anthropic Claude with a four-layer guardrail strategy. The full agent design rationale lives in `DESIGN.md`.
+
+## Stack
+
+- Python 3.11+, FastAPI, Uvicorn
+- Anthropic Claude (Sonnet) with prompt caching
+- Vanilla HTML / CSS / JS chat frontend (no build step)
+- Pytest
+
+## Setup
+
+```
+python3 -m venv venv
+./venv/bin/pip install -r requirements.txt
+cp .env.example .env
+# edit .env and set ANTHROPIC_API_KEY
+```
+
+## Run locally
+
+```
+./venv/bin/uvicorn server:app --host 127.0.0.1 --port 8014
+```
+
+Then open <http://127.0.0.1:8014> in a browser.
+
+## Tests
+
+```
+./venv/bin/python -m pytest tests/ -v
+```
+
+Tests mock the Anthropic client, so no API key or network access is required.
+
+## Project layout
+
+```
+agent.py        System prompt, guardrails (layers 1, 2, 4), agentic loop
+tools.py        Tool schemas, handlers, SessionGuardState (layer 3)
+mock_data.py    Orders, return policy, FAQ policies
+server.py       FastAPI app: /api/chat, /health, static mount
+config.py       pydantic-settings config loaded from .env
+static/         index.html, style.css, chat.js
+tests/          test_tools.py, test_agent.py
+deploy/         systemd unit + nginx site config for the production droplet
+```
+
+## Design
+
+See `DESIGN.md` for the architecture, conversation design, hallucination and safety controls, and production-readiness tradeoffs.
diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..9f1cea1
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,411 @@
+"""Bookly agent: system prompt, guardrails, and the agentic loop.
+
+This module wires four guardrail layers together:
+
+1. The system prompt itself (XML-tagged, primacy+recency duplication, verbatim
+   policy block, refusal template, few-shot examples for edge cases).
+2. Runtime reminder injection: a short "non-negotiable rules" block appended
+   to the system content on every turn, plus a stronger reminder once the
+   conversation gets long enough that the original prompt has decayed in
+   effective attention.
+3. Tool-side enforcement (lives in `tools.py`): handlers refuse unsafe calls
+   regardless of what the model decides.
+4. Output validation: deterministic regex checks on the final reply for
+   ungrounded order IDs/dates, markdown leakage, and off-topic engagement
+   without the refusal template. On failure, the bad reply is dropped and the
+   user gets a safe canned message — and the bad reply is never appended to
+   history, so it cannot poison subsequent turns.
+
+Anthropic prompt caching is enabled on the large system-prompt block so the
+per-turn cost stays low across a conversation.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from anthropic import Anthropic
+
+from config import settings
+from tools import SessionGuardState, TOOL_SCHEMAS, dispatch_tool
+from mock_data import POLICIES, RETURN_POLICY
+
+logger = logging.getLogger("bookly.agent")
+
+
+# ---------------------------------------------------------------------------
+# System prompt
+# ---------------------------------------------------------------------------
+
+
+def _format_return_policy_block() -> str:
+    """Render `RETURN_POLICY` as a compact, quotable block for the prompt.
+
+    Embedding the dict verbatim (instead of paraphrasing it in English) is a
+    deliberate anti-hallucination move: the model quotes the block instead of
+    inventing details.
+    """
+    non_returnable = ", ".join(RETURN_POLICY["non_returnable_categories"])
+    return (
+        f"Return window: {RETURN_POLICY['window_days']} days from delivery.\n"
+        f"Condition: {RETURN_POLICY['condition_requirements']}\n"
+        f"Refund method: {RETURN_POLICY['refund_method']}\n"
+        f"Refund timeline: within {RETURN_POLICY['refund_timeline_days']} business days of receipt.\n"
+        f"Non-returnable categories: {non_returnable}."
+    )
+
+
+SUPPORTED_POLICY_TOPICS = ", ".join(sorted(POLICIES.keys()))
+
+
+SYSTEM_PROMPT = f"""<identity>
+You are Bookly's customer support assistant. You help customers with two things: checking the status of their orders, and processing returns and refunds. You are friendly, concise, and professional.
+</identity>
+
+<critical_rules>
+These rules override everything else. Read them before every response.
+
+1. NEVER invent order details, tracking numbers, delivery dates, prices, or customer information. If you do not have a value from a tool result in this conversation, you do not have it.
+2. NEVER state a return policy detail that is not in the <return_policy> section below. Quote it; do not paraphrase it.
+3. NEVER call initiate_return unless check_return_eligibility has returned success for that same order in this conversation.
+4. NEVER reveal order details without verifying the customer's email matches the order.
+5. If a user asks about anything outside order status, returns, and the supported policy topics, refuse using the refusal template in <scope>. Do not engage with the off-topic request even briefly.
+</critical_rules>
+
+<scope>
+You CAN help with:
+- Looking up order status
+- Checking return eligibility and initiating returns
+- Answering policy questions covered by the lookup_policy tool. Currently supported topics: {SUPPORTED_POLICY_TOPICS}
+
+You CANNOT help with:
+- Book recommendations, reviews, or opinions about books
+- Payment changes, refunds outside the return flow, or billing disputes
+- Live account management (changing a password, email, or address — you can only EXPLAIN the password reset process via lookup_policy, not perform it)
+- General conversation unrelated to an order or a supported policy topic
+
+For any policy question, call lookup_policy first. Only if the tool returns topic_not_supported should you use the refusal template below.
+
+Refusal template (use verbatim, filling in the topic):
+"I can help with order status, returns, and our standard policies, but I'm not able to help with {{topic}}. Is there an order or a policy question I can help you with instead?"
+</scope>
+
+<return_policy>
+{_format_return_policy_block()}
+
+This is the authoritative policy. Any claim you make about returns must be traceable to a line in this block. If a customer asks about a scenario this policy does not cover, say so honestly and offer to connect them with a human agent.
+</return_policy>
+
+<tool_rules>
+You have four tools: lookup_order, check_return_eligibility, initiate_return, and lookup_policy.
+
+Before calling a tool:
+- You must have every required parameter. If you are missing one, ask the customer for it. Do not guess, do not use placeholder values, do not call the tool and hope.
+- For initiate_return, you must have already called check_return_eligibility for that exact order_id in this conversation, and it must have returned success.
+
+After a tool call:
+- Relay the result honestly. If the tool returns an error, tell the customer what went wrong using the tool's error message, not a paraphrase.
+- Do not mix tool results from different orders in a single response unless the customer explicitly asked about multiple.
+- For lookup_policy, quote the returned policy text; do not summarize or embellish. If lookup_policy returns topic_not_supported, fall through to the refusal template in <scope>.
+</tool_rules>
+
+<clarifying_rules>
+Ask one clarifying question at a time, not a list. Common cases:
+
+- Customer mentions "my order" without an order ID: ask for the order ID. Tell them it starts with "BK-" and is in their confirmation email.
+- Customer gives an order ID but no email, and wants a return: ask for the email on the order.
+- A customer has multiple orders and was ambiguous: ask which order they mean, listing them by ID and status only.
+- Customer wants to initiate a return: after eligibility is confirmed, summarize what will happen (which items, refund method, timeline) and ask for explicit confirmation before calling initiate_return.
+</clarifying_rules>
+
+<tone>
+- Friendly and warm, but not chatty. One or two sentences per turn is usually right.
+- Use the customer's first name once you know it, but not in every message.
+- Plain text only. No markdown, no bullet points, no headers, no asterisks for emphasis. The chat UI does not render markdown.
+- Never apologize more than once for the same issue.
+</tone>
+
+<examples>
+Example 1 — missing order ID:
+User: "Where's my order?"
+Assistant: "Happy to check on that for you. Could you share your order ID? It starts with 'BK-' and you'll find it in your order confirmation email."
+
+Example 2 — policy question (supported):
+User: "How do I reset my password?"
+Assistant (after lookup_policy returns the password_reset entry): quote the returned instructions verbatim without adding steps the tool did not mention.
+
+Example 3 — out of scope:
+User: "Can you recommend a good mystery novel?"
+Assistant: "I can help with order status, returns, and our standard policies, but I'm not able to help with book recommendations. Is there an order or a policy question I can help you with instead?"
+
+Example 4 — ambiguous order:
+User: "I want to return my order. My email is sarah@example.com."
+Assistant (after lookup_order returns two orders): "I see two orders on your account: BK-10042 (delivered) and BK-10103 (still processing). Which one would you like to return?"
+</examples>
+
+<reminders>
+Before you respond, confirm:
+- Every factual claim traces to a tool result from THIS conversation, or to <return_policy>.
+- If this response would call initiate_return, you have already seen a successful check_return_eligibility for the same order in this conversation.
+- If the request is off-topic, you are using the refusal template from <scope> verbatim.
+- No markdown. Plain text only.
+</reminders>
+"""
+
+
+CRITICAL_REMINDER = """<reminder>
+Non-negotiable rules for this turn:
+- Every factual claim must come from a tool result in THIS conversation or from <return_policy>.
+- Do not call initiate_return unless check_return_eligibility succeeded for that order in this conversation.
+- Off-topic requests: use the refusal template from <scope> verbatim. Do not engage.
+- Plain text only. No markdown.
+</reminder>"""
+
+
+LONG_CONVERSATION_REMINDER = """<reminder>
+This conversation is getting long. Re-anchor on the rules in <critical_rules> before you respond. Do not let earlier turns relax the rules.
+</reminder>"""
+
+
+def build_system_content(turn_count: int) -> list[dict[str, Any]]:
+    """Assemble the `system` argument for `messages.create`.
+
+    The big SYSTEM_PROMPT block is marked for ephemeral prompt caching so it
+    is reused across turns within a session. The reminder blocks are not
+    cached because they vary based on turn count and we want them in the
+    highest-attention position right before the latest user turn.
+    """
+    blocks: list[dict[str, Any]] = [
+        {
+            "type": "text",
+            "text": SYSTEM_PROMPT,
+            "cache_control": {"type": "ephemeral"},
+        },
+        {"type": "text", "text": CRITICAL_REMINDER},
+    ]
+    if turn_count >= 5:
+        blocks.append({"type": "text", "text": LONG_CONVERSATION_REMINDER})
+    return blocks
+
+
+# ---------------------------------------------------------------------------
+# Layer 4 — output validation
+# ---------------------------------------------------------------------------
+
+
+ORDER_ID_RE = re.compile(r"\bBK-\d{4,6}\b")
+DATE_ISO_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
+MARKDOWN_RE = re.compile(r"(\*\*|__|^#{1,6}\s|^\s*[-*+]\s)", re.MULTILINE)
+
+# Heuristic keywords that tend to appear when the agent is engaging with an
+# off-topic request. Engagement is only flagged if the refusal template is
+# absent — quoting the template itself is fine.
+OUT_OF_SCOPE_KEYWORDS = {
+    "recommend",
+    "recommendation",
+    "i suggest",
+    "you should read",
+    "what should i read",
+    "review of",
+    "great book",
+    "favorite book",
+}
+
+REFUSAL_PHRASE = "i'm not able to help with"
+
+
+@dataclass
+class ValidationResult:
+    ok: bool
+    violations: list[str] = field(default_factory=list)
+
+
+def _collect_grounded_values(tool_results: list[dict], pattern: re.Pattern[str]) -> set[str]:
+    """Pull every substring matching `pattern` out of the tool result JSON."""
+    grounded: set[str] = set()
+    for entry in tool_results:
+        text = json.dumps(entry.get("result", {}))
+        grounded.update(pattern.findall(text))
+    return grounded
+
+
+def validate_reply(reply: str, tool_results_this_turn: list[dict]) -> ValidationResult:
+    """Run deterministic checks on the final assistant reply.
+
+    Heuristic, not exhaustive. Catches the cheap wins — fabricated order IDs,
+    made-up dates, markdown leakage, and obvious off-topic engagement. For
+    anything subtler we rely on layers 1–3.
+    """
+    assert isinstance(reply, str), "reply must be a string"
+    assert isinstance(tool_results_this_turn, list), "tool_results_this_turn must be a list"
+
+    violations: list[str] = []
+
+    grounded_ids = _collect_grounded_values(tool_results_this_turn, ORDER_ID_RE)
+    for match in ORDER_ID_RE.findall(reply):
+        if match not in grounded_ids:
+            violations.append(f"ungrounded_order_id:{match}")
+
+    grounded_dates = _collect_grounded_values(tool_results_this_turn, DATE_ISO_RE)
+    for match in DATE_ISO_RE.findall(reply):
+        if match not in grounded_dates:
+            violations.append(f"ungrounded_date:{match}")
+
+    if MARKDOWN_RE.search(reply):
+        violations.append("markdown_leaked")
+
+    lowered = reply.lower()
+    engaged_off_topic = any(kw in lowered for kw in OUT_OF_SCOPE_KEYWORDS)
+    if engaged_off_topic and REFUSAL_PHRASE not in lowered:
+        violations.append("off_topic_engagement")
+
+    return ValidationResult(ok=not violations, violations=violations)
+
+
+# ---------------------------------------------------------------------------
+# Session and agent loop
+# ---------------------------------------------------------------------------
+
+
+SAFE_FALLBACK = (
+    "I hit a problem generating a response. Could you rephrase your question, "
+    "or share an order ID so I can try again?"
+)
+
+
+@dataclass
+class Session:
+    history: list[dict[str, Any]] = field(default_factory=list)
+    guard_state: SessionGuardState = field(default_factory=SessionGuardState)
+    turn_count: int = 0
+
+
+# Global session store keyed by session_id. The server module owns the
+# lifetime of these — agent.py only reads/writes them through `run_turn`.
+SESSIONS: dict[str, Session] = {}
+
+
+def get_or_create_session(session_id: str) -> Session:
+    assert isinstance(session_id, str) and session_id, "session_id is required"
+    session = SESSIONS.get(session_id)
+    if session is None:
+        session = Session()
+        SESSIONS[session_id] = session
+    return session
+
+
+# Lazily initialized so unit tests can monkeypatch _client without tripping
+# the missing-env-var failure path.
+_client: Anthropic | None = None
+
+
+def _get_client() -> Anthropic:
+    global _client
+    if _client is None:
+        _client = Anthropic(api_key=settings.anthropic_api_key)
+    return _client
+
+
+def _extract_text(content_blocks: list[Any]) -> str:
+    parts: list[str] = []
+    for block in content_blocks:
+        if getattr(block, "type", None) == "text":
+            parts.append(getattr(block, "text", "") or "")
+    return "".join(parts).strip()
+
+
+def _serialize_assistant_content(content_blocks: list[Any]) -> list[dict]:
+    """Convert SDK content blocks back into JSON-serializable dicts for history."""
+    serialized: list[dict] = []
+    for block in content_blocks:
+        block_type = getattr(block, "type", None)
+        if block_type == "text":
+            serialized.append({"type": "text", "text": getattr(block, "text", "") or ""})
+        elif block_type == "tool_use":
+            serialized.append(
+                {
+                    "type": "tool_use",
+                    "id": getattr(block, "id", None),
+                    "name": getattr(block, "name", None),
+                    "input": getattr(block, "input", None),
+                }
+            )
+    return serialized
+
+
+def run_turn(session_id: str, user_message: str) -> str:
+    """Run one user turn end-to-end and return the assistant's reply text.
+
+    Wires together: history append, system content with reminders, the
+    tool-use loop, output validation, and the safe-fallback path on
+    validation failure.
+    """
+    assert isinstance(user_message, str) and user_message.strip(), "user_message is required"
+
+    session = get_or_create_session(session_id)
+    session.history.append({"role": "user", "content": user_message})
+
+    system_content = build_system_content(session.turn_count)
+    client = _get_client()
+
+    tool_results_this_turn: list[dict] = []
+
+    def _call_model() -> Any:
+        return client.messages.create(
+            model=settings.anthropic_model,
+            max_tokens=settings.max_tokens,
+            system=system_content,
+            tools=TOOL_SCHEMAS,
+            messages=session.history,
+        )
+
+    response = _call_model()
+
+    # Tool-use loop: keep dispatching tools until the model returns end_turn.
+    while getattr(response, "stop_reason", None) == "tool_use":
+        assistant_blocks = _serialize_assistant_content(response.content)
+        session.history.append({"role": "assistant", "content": assistant_blocks})
+
+        tool_result_blocks: list[dict] = []
+        for block in response.content:
+            if getattr(block, "type", None) != "tool_use":
+                continue
+            name = getattr(block, "name")
+            args = getattr(block, "input", None) or {}
+            tool_id = getattr(block, "id")
+            result = dispatch_tool(name, args, session.guard_state)
+            tool_results_this_turn.append({"name": name, "result": result})
+            tool_result_blocks.append(
+                {
+                    "type": "tool_result",
+                    "tool_use_id": tool_id,
+                    "content": json.dumps(result),
+                }
+            )
+
+        session.history.append({"role": "user", "content": tool_result_blocks})
+        response = _call_model()
+
+    reply_text = _extract_text(response.content)
+    validation = validate_reply(reply_text, tool_results_this_turn)
+    if not validation.ok:
+        logger.warning(
+            "validation_failed session=%s turn=%s violations=%s reply=%r",
+            session_id,
+            session.turn_count,
+            validation.violations,
+            reply_text,
+        )
+        # Do NOT append the bad reply to history — that would poison future turns.
+        session.turn_count += 1
+        return SAFE_FALLBACK
+
+    session.history.append(
+        {"role": "assistant", "content": _serialize_assistant_content(response.content)}
+    )
+    session.turn_count += 1
+    return reply_text
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..7d6b417
--- /dev/null
+++ b/config.py
@@ -0,0 +1,21 @@
+"""Application configuration loaded from environment variables.
+
+Settings are read from `.env` at process start. The Anthropic API key is the
+only required secret; everything else has a sensible default so the app can
+boot in dev without ceremony.
+"""
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
+
+    anthropic_api_key: str
+    anthropic_model: str = "claude-sonnet-4-5"
+    max_tokens: int = 1024
+    server_host: str = "127.0.0.1"
+    server_port: int = 8014
+
+
+settings = Settings()  # type: ignore[call-arg]
diff --git a/deploy/bookly.nginx.conf b/deploy/bookly.nginx.conf
new file mode 100644
index 0000000..86d9b75
--- /dev/null
+++ b/deploy/bookly.nginx.conf
@@ -0,0 +1,18 @@
+server {
+    listen 80;
+    server_name bookly.codyborders.com;
+
+    access_log /var/log/nginx/bookly.access.log;
+    error_log  /var/log/nginx/bookly.error.log;
+
+    location / {
+        proxy_pass http://127.0.0.1:8014;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 120s;
+        proxy_send_timeout 120s;
+    }
+}
diff --git a/deploy/bookly.service b/deploy/bookly.service
new file mode 100644
index 0000000..0073dbd
--- /dev/null
+++ b/deploy/bookly.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Bookly customer support agent
+After=network.target
+
+[Service]
+Type=simple
+User=bookly
+Group=bookly
+WorkingDirectory=/opt/bookly
+EnvironmentFile=/opt/bookly/.env
+ExecStart=/opt/bookly/venv/bin/uvicorn server:app --host 127.0.0.1 --port 8014
+Restart=on-failure
+RestartSec=3
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=multi-user.target
diff --git a/mock_data.py b/mock_data.py
new file mode 100644
index 0000000..55cbaa8
--- /dev/null
+++ b/mock_data.py
@@ -0,0 +1,109 @@
+"""In-memory data fixtures for orders, returns, and FAQ policies.
+
+`ORDERS` and `RETURN_POLICY` are read by both the system prompt (so the prompt
+quotes policy verbatim instead of paraphrasing) and the tool handlers (so the
+two never drift apart). `RETURNS` is mutated by `initiate_return` at runtime.
+"""
+
+from datetime import date, timedelta
+
+# A frozen "today" so the four-order fixture stays deterministic across runs.
+TODAY = date(2026, 4, 14)
+
+
+def _days_ago(n: int) -> str:
+    return (TODAY - timedelta(days=n)).isoformat()
+
+
+RETURN_POLICY: dict = {
+    "window_days": 30,
+    "condition_requirements": "Items must be unread, undamaged, and in their original packaging.",
+    "refund_method": "Refunds are issued to the original payment method.",
+    "refund_timeline_days": 7,
+    "non_returnable_categories": ["ebooks", "audiobooks", "gift cards", "personalized items"],
+}
+
+
+# Four orders covering the interesting scenarios. Sarah Chen has two orders so
+# the agent must disambiguate when she says "my order".
+ORDERS: dict = {
+    "BK-10042": {
+        "order_id": "BK-10042",
+        "customer_name": "Sarah Chen",
+        "email": "sarah.chen@example.com",
+        "status": "delivered",
+        "order_date": _days_ago(20),
+        "delivered_date": _days_ago(15),
+        "tracking_number": "1Z999AA10123456784",
+        "items": [
+            {"title": "The Goldfinch", "author": "Donna Tartt", "price": 16.99, "category": "fiction"},
+            {"title": "Sapiens", "author": "Yuval Noah Harari", "price": 19.99, "category": "nonfiction"},
+        ],
+        "total": 36.98,
+    },
+    "BK-10089": {
+        "order_id": "BK-10089",
+        "customer_name": "James Murphy",
+        "email": "james.murphy@example.com",
+        "status": "shipped",
+        "order_date": _days_ago(4),
+        "delivered_date": None,
+        "tracking_number": "1Z999AA10987654321",
+        "items": [
+            {"title": "Project Hail Mary", "author": "Andy Weir", "price": 18.99, "category": "fiction"},
+        ],
+        "total": 18.99,
+    },
+    "BK-10103": {
+        "order_id": "BK-10103",
+        "customer_name": "Sarah Chen",
+        "email": "sarah.chen@example.com",
+        "status": "processing",
+        "order_date": _days_ago(1),
+        "delivered_date": None,
+        "tracking_number": None,
+        "items": [
+            {"title": "Tomorrow, and Tomorrow, and Tomorrow", "author": "Gabrielle Zevin", "price": 17.99, "category": "fiction"},
+        ],
+        "total": 17.99,
+    },
+    "BK-9871": {
+        "order_id": "BK-9871",
+        "customer_name": "Maria Gonzalez",
+        "email": "maria.gonzalez@example.com",
+        "status": "delivered",
+        "order_date": _days_ago(60),
+        "delivered_date": _days_ago(55),
+        "tracking_number": "1Z999AA10555555555",
+        "items": [
+            {"title": "The Midnight Library", "author": "Matt Haig", "price": 15.99, "category": "fiction"},
+        ],
+        "total": 15.99,
+    },
+}
+
+
+# Verbatim FAQ entries returned by `lookup_policy`. The agent quotes these
+# without paraphrasing.
+POLICIES: dict[str, str] = {
+    "shipping": (
+        "Standard shipping is free on orders over $25 and takes 3-5 business days. "
+        "Expedited shipping (1-2 business days) is $9.99. We ship to all 50 US states. "
+        "International shipping is not currently available."
+    ),
+    "password_reset": (
+        "To reset your password, go to bookly.com/account/login and click \"Forgot password.\" "
+        "Enter the email on your account and we will send you a reset link. "
+        "The link expires after 24 hours. If you do not receive the email, check your spam folder."
+    ),
+    "returns_overview": (
+        "You can return most items within 30 days of delivery for a full refund to your original "
+        "payment method. Items must be unread, undamaged, and in their original packaging. "
+        "Ebooks, audiobooks, gift cards, and personalized items are not returnable. "
+        "Refunds typically post within 7 business days of us receiving the return."
+    ),
+}
+
+
+# Mutated at runtime by `initiate_return`. Keyed by return_id.
+RETURNS: dict[str, dict] = {}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9b815ca
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+anthropic>=0.40.0
+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+pydantic-settings>=2.6.0
+python-dotenv>=1.0.0
+pytest>=8.3.0
+httpx>=0.27.0
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..c809f36
--- /dev/null
+++ b/server.py
@@ -0,0 +1,50 @@
+"""FastAPI app for Bookly. Hosts /api/chat, /health, and the static chat UI."""
+
+from __future__ import annotations
+
+import logging
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import RedirectResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+
+import agent
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
+logger = logging.getLogger("bookly.server")
+
+app = FastAPI(title="Bookly", docs_url=None, redoc_url=None)
+
+
+class ChatRequest(BaseModel):
+    session_id: str = Field(..., min_length=1, max_length=128)
+    message: str = Field(..., min_length=1, max_length=4000)
+
+
+class ChatResponse(BaseModel):
+    session_id: str
+    reply: str
+
+
+@app.get("/health")
+def health() -> dict:
+    return {"status": "ok"}
+
+
+@app.get("/")
+def root() -> RedirectResponse:
+    return RedirectResponse(url="/static/index.html")
+
+
+@app.post("/api/chat", response_model=ChatResponse)
+def chat(request: ChatRequest) -> ChatResponse:
+    try:
+        reply = agent.run_turn(request.session_id, request.message)
+    except Exception:
+        logger.exception("chat_failed session=%s", request.session_id)
+        raise HTTPException(status_code=500, detail="Something went wrong handling that message.")
+    return ChatResponse(session_id=request.session_id, reply=reply)
+
+
+app.mount("/static", StaticFiles(directory="static"), name="static")
diff --git a/static/chat.js b/static/chat.js
new file mode 100644
index 0000000..c86bfbc
--- /dev/null
+++ b/static/chat.js
@@ -0,0 +1,82 @@
+(function () {
+  "use strict";
+
+  const messagesEl = document.getElementById("messages");
+  const formEl = document.getElementById("composer");
+  const inputEl = document.getElementById("input");
+  const sendEl = document.getElementById("send");
+
+  const SESSION_KEY = "bookly_session_id";
+  let sessionId = sessionStorage.getItem(SESSION_KEY);
+  if (!sessionId) {
+    sessionId = crypto.randomUUID();
+    sessionStorage.setItem(SESSION_KEY, sessionId);
+  }
+
+  const GREETING =
+    "Hi! I'm the Bookly support assistant. I can help you check on an order, start a return, or answer questions about shipping, returns, or password reset. How can I help today?";
+
+  function appendMessage(role, text) {
+    const el = document.createElement("div");
+    el.className = "message message--" + role;
+    el.textContent = text;
+    messagesEl.appendChild(el);
+    messagesEl.scrollTop = messagesEl.scrollHeight;
+    return el;
+  }
+
+  function appendTypingIndicator() {
+    const el = document.createElement("div");
+    el.className = "message message--assistant message--typing";
+    el.setAttribute("aria-label", "Assistant is typing");
+    el.innerHTML = "<span></span><span></span><span></span>";
+    messagesEl.appendChild(el);
+    messagesEl.scrollTop = messagesEl.scrollHeight;
+    return el;
+  }
+
+  async function sendMessage(text) {
+    const response = await fetch("/api/chat", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({ session_id: sessionId, message: text }),
+    });
+    if (!response.ok) {
+      throw new Error("Server returned " + response.status);
+    }
+    const data = await response.json();
+    return data.reply;
+  }
+
+  formEl.addEventListener("submit", async function (event) {
+    event.preventDefault();
+    const text = inputEl.value.trim();
+    if (!text) return;
+
+    appendMessage("user", text);
+    inputEl.value = "";
+    inputEl.disabled = true;
+    sendEl.disabled = true;
+
+    const typing = appendTypingIndicator();
+    try {
+      const reply = await sendMessage(text);
+      typing.remove();
+      appendMessage("assistant", reply);
+    } catch (err) {
+      typing.remove();
+      appendMessage(
+        "assistant",
+        "Sorry, I couldn't reach the server. Please try again in a moment."
+      );
+      console.error(err);
+    } finally {
+      inputEl.disabled = false;
+      sendEl.disabled = false;
+      inputEl.focus();
+    }
+  });
+
+  appendMessage("assistant", GREETING);
+  inputEl.focus();
+})();
diff --git a/static/index.html b/static/index.html
new file mode 100644
index 0000000..ea2067a
--- /dev/null
+++ b/static/index.html
@@ -0,0 +1,30 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>Bookly Support</title>
+  <link rel="stylesheet" href="/static/style.css" />
+</head>
+<body>
+  <main class="chat">
+    <header class="chat__header">
+      <h1>Bookly Support</h1>
+      <p class="chat__subtitle">Order status, returns, and policy questions</p>
+    </header>
+    <div id="messages" class="chat__messages" aria-live="polite"></div>
+    <form id="composer" class="chat__composer" autocomplete="off">
+      <input
+        id="input"
+        class="chat__input"
+        type="text"
+        placeholder="Ask about an order or a return..."
+        maxlength="4000"
+        required
+      />
+      <button id="send" class="chat__send" type="submit">Send</button>
+    </form>
+  </main>
+  <script src="/static/chat.js"></script>
+</body>
+</html>
diff --git a/static/style.css b/static/style.css
new file mode 100644
index 0000000..c43a7a6
--- /dev/null
+++ b/static/style.css
@@ -0,0 +1,161 @@
+:root {
+  --bg: #f5f3ee;
+  --panel: #ffffff;
+  --ink: #1a1a1a;
+  --ink-muted: #6b6b6b;
+  --accent: #2e5b8a;
+  --accent-ink: #ffffff;
+  --bubble-user: #2e5b8a;
+  --bubble-user-ink: #ffffff;
+  --bubble-assistant: #ececec;
+  --bubble-assistant-ink: #1a1a1a;
+  --border: #e2ddd2;
+}
+
+* {
+  box-sizing: border-box;
+}
+
+html, body {
+  margin: 0;
+  padding: 0;
+  height: 100%;
+  background: var(--bg);
+  color: var(--ink);
+  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+  font-size: 16px;
+  line-height: 1.5;
+}
+
+.chat {
+  display: flex;
+  flex-direction: column;
+  max-width: 720px;
+  margin: 0 auto;
+  height: 100vh;
+  background: var(--panel);
+  border-left: 1px solid var(--border);
+  border-right: 1px solid var(--border);
+}
+
+.chat__header {
+  padding: 18px 24px;
+  border-bottom: 1px solid var(--border);
+  background: var(--panel);
+}
+
+.chat__header h1 {
+  margin: 0;
+  font-size: 20px;
+  font-weight: 600;
+  letter-spacing: -0.01em;
+}
+
+.chat__subtitle {
+  margin: 4px 0 0;
+  font-size: 13px;
+  color: var(--ink-muted);
+}
+
+.chat__messages {
+  flex: 1;
+  overflow-y: auto;
+  padding: 20px 24px;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+
+.message {
+  max-width: 78%;
+  padding: 10px 14px;
+  border-radius: 16px;
+  white-space: pre-wrap;
+  word-wrap: break-word;
+  font-size: 15px;
+}
+
+.message--user {
+  align-self: flex-end;
+  background: var(--bubble-user);
+  color: var(--bubble-user-ink);
+  border-bottom-right-radius: 4px;
+}
+
+.message--assistant {
+  align-self: flex-start;
+  background: var(--bubble-assistant);
+  color: var(--bubble-assistant-ink);
+  border-bottom-left-radius: 4px;
+}
+
+.message--typing {
+  display: inline-flex;
+  gap: 4px;
+  align-items: center;
+}
+
+.message--typing span {
+  width: 6px;
+  height: 6px;
+  background: var(--ink-muted);
+  border-radius: 50%;
+  opacity: 0.4;
+  animation: typing 1.2s infinite ease-in-out;
+}
+
+.message--typing span:nth-child(2) { animation-delay: 0.15s; }
+.message--typing span:nth-child(3) { animation-delay: 0.3s; }
+
+@keyframes typing {
+  0%, 80%, 100% { opacity: 0.3; transform: translateY(0); }
+  40% { opacity: 1; transform: translateY(-2px); }
+}
+
+.chat__composer {
+  display: flex;
+  gap: 10px;
+  padding: 14px 16px;
+  border-top: 1px solid var(--border);
+  background: var(--panel);
+}
+
+.chat__input {
+  flex: 1;
+  padding: 11px 14px;
+  font-size: 15px;
+  border: 1px solid var(--border);
+  border-radius: 22px;
+  outline: none;
+  background: var(--panel);
+  color: var(--ink);
+}
+
+.chat__input:focus {
+  border-color: var(--accent);
+}
+
+.chat__send {
+  padding: 11px 22px;
+  font-size: 15px;
+  font-weight: 600;
+  color: var(--accent-ink);
+  background: var(--accent);
+  border: none;
+  border-radius: 22px;
+  cursor: pointer;
+}
+
+.chat__send:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+@media (max-width: 720px) {
+  .chat {
+    border: none;
+  }
+  .message {
+    max-width: 88%;
+  }
+}
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_agent.py b/tests/test_agent.py
new file mode 100644
index 0000000..6584445
--- /dev/null
+++ b/tests/test_agent.py
@@ -0,0 +1,262 @@
+"""Agent-layer tests: validate_reply (Layer 4) and run_turn end-to-end with a
+mocked Anthropic client.
+
+The Anthropic API is never called. Each test wires a fake `_client` onto the
+agent module that produces canned response objects, so the tests assert how
+the agent loop wires layers 3 and 4 together rather than what the model
+actually generates.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+# Provide a dummy API key so `from agent import ...` does not fail when
+# pydantic-settings reads .env.
+os.environ.setdefault("ANTHROPIC_API_KEY", "test-key-not-used")
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+import agent
+from agent import SAFE_FALLBACK, SESSIONS, build_system_content, run_turn, validate_reply
+
+
+# ---------------------------------------------------------------------------
+# Mock SDK objects
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MockTextBlock:
+    text: str
+    type: str = "text"
+
+
+@dataclass
+class MockToolUseBlock:
+    id: str
+    name: str
+    input: dict
+    type: str = "tool_use"
+
+
+@dataclass
+class MockResponse:
+    content: list[Any]
+    stop_reason: str = "end_turn"
+
+
+class MockClient:
+    """A scripted Anthropic client. Hands out the next response in `script`
+    each time `messages.create` is called."""
+
+    def __init__(self, script: list[MockResponse]):
+        self.script = list(script)
+        self.calls: list[dict] = []
+
+        client_self = self
+
+        class _Messages:
+            def create(self, **kwargs):
+                client_self.calls.append(kwargs)
+                if not client_self.script:
+                    raise AssertionError("MockClient ran out of scripted responses")
+                return client_self.script.pop(0)
+
+        self.messages = _Messages()
+
+
+@pytest.fixture(autouse=True)
+def _reset_sessions_and_client(monkeypatch):
+    SESSIONS.clear()
+    monkeypatch.setattr(agent, "_client", None)
+    yield
+    SESSIONS.clear()
+
+
+def _install_mock(monkeypatch, script: list[MockResponse]) -> MockClient:
+    client = MockClient(script)
+    monkeypatch.setattr(agent, "_get_client", lambda: client)
+    return client
+
+
+# ---------------------------------------------------------------------------
+# build_system_content
+# ---------------------------------------------------------------------------
+
+
+def test_build_system_content_caches_main_prompt_block():
+    blocks = build_system_content(turn_count=0)
+    assert blocks[0]["cache_control"] == {"type": "ephemeral"}
+    # Reminder block is present but uncached.
+    assert blocks[1]["text"].startswith("<reminder>")
+    assert "cache_control" not in blocks[1]
+
+
+def test_build_system_content_adds_long_conversation_reminder_after_threshold():
+    short = build_system_content(turn_count=2)
+    long = build_system_content(turn_count=5)
+    assert len(long) == len(short) + 1
+    assert "long" in long[-1]["text"].lower()
+
+
+# ---------------------------------------------------------------------------
+# validate_reply
+# ---------------------------------------------------------------------------
+
+
+def test_validate_reply_passes_clean_reply():
+    result = validate_reply("Your order BK-10042 was delivered.", [
+        {"name": "lookup_order", "result": {"order": {"order_id": "BK-10042"}}},
+    ])
+    assert result.ok
+    assert result.violations == []
+
+
+def test_validate_reply_flags_ungrounded_order_id():
+    result = validate_reply("Your order BK-99999 is on the way.", [])
+    assert not result.ok
+    assert "ungrounded_order_id:BK-99999" in result.violations
+
+
+def test_validate_reply_flags_ungrounded_date():
+    result = validate_reply("It will arrive on 2026-12-25.", [])
+    assert not result.ok
+    assert any(v.startswith("ungrounded_date:") for v in result.violations)
+
+
+def test_validate_reply_passes_grounded_date():
+    result = validate_reply("It was delivered on 2026-04-01.", [
+        {"name": "lookup_order", "result": {"order": {"delivered_date": "2026-04-01"}}},
+    ])
+    assert result.ok
+
+
+def test_validate_reply_flags_markdown_bold():
+    result = validate_reply("Here are your **details**.", [])
+    assert not result.ok
+    assert "markdown_leaked" in result.violations
+
+
+def test_validate_reply_flags_markdown_bullet():
+    result = validate_reply("Items:\n- The Goldfinch\n- Sapiens", [])
+    assert not result.ok
+    assert "markdown_leaked" in result.violations
+
+
+def test_validate_reply_flags_off_topic_engagement():
+    result = validate_reply(
+        "I recommend Project Hail Mary, it's a great book.",
+        [],
+    )
+    assert not result.ok
+    assert "off_topic_engagement" in result.violations
+
+
+def test_validate_reply_allows_refusal_template_even_with_keywords():
+    reply = "I can help with order status, returns, and our standard policies, but I'm not able to help with book recommendations. Is there an order or a policy question I can help you with instead?"
+    result = validate_reply(reply, [])
+    assert result.ok
+
+
+# ---------------------------------------------------------------------------
+# run_turn end-to-end with mocked client
+# ---------------------------------------------------------------------------
+
+
+def test_run_turn_returns_simple_text_reply(monkeypatch):
+    _install_mock(monkeypatch, [
+        MockResponse(content=[MockTextBlock(text="Hi! How can I help with an order today?")]),
+    ])
+    reply = run_turn("session-1", "hi there")
+    assert "How can I help" in reply
+    session = SESSIONS["session-1"]
+    assert session.turn_count == 1
+    assert session.history[-1]["role"] == "assistant"
+
+
+def test_run_turn_with_tool_use_loop(monkeypatch):
+    """Two-step loop: model asks for a tool, then produces a final reply."""
+    first = MockResponse(
+        stop_reason="tool_use",
+        content=[
+            MockToolUseBlock(
+                id="toolu_1",
+                name="lookup_order",
+                input={"order_id": "BK-10042"},
+            )
+        ],
+    )
+    second = MockResponse(
+        content=[MockTextBlock(text="Your order BK-10042 was delivered.")],
+    )
+    client = _install_mock(monkeypatch, [first, second])
+    reply = run_turn("session-2", "Where is BK-10042?")
+    assert "BK-10042" in reply
+    assert len(client.calls) == 2
+    # History must contain: user, assistant(tool_use), user(tool_result), assistant(text)
+    history = SESSIONS["session-2"].history
+    assert history[0]["role"] == "user"
+    assert history[1]["role"] == "assistant"
+    assert history[2]["role"] == "user"  # tool_result is a user-role message
+    assert history[3]["role"] == "assistant"
+
+
+def test_run_turn_drops_hallucinated_reply_and_returns_safe_fallback(monkeypatch):
+    """A reply that mentions an order ID never seen by a tool must trigger
+    SAFE_FALLBACK, and the bad reply must not be appended to history."""
+    _install_mock(monkeypatch, [
+        MockResponse(content=[MockTextBlock(text="Your order BK-99999 will arrive on 2026-12-25.")]),
+    ])
+    reply = run_turn("session-3", "where is my order")
+    assert reply == SAFE_FALLBACK
+    history = SESSIONS["session-3"].history
+    # Only the user message should be in history; no hallucinated assistant.
+    assert len(history) == 1
+    assert history[0]["role"] == "user"
+
+
+def test_run_turn_passes_through_refusal_template(monkeypatch):
+    refusal = "I can help with order status, returns, and our standard policies, but I'm not able to help with book recommendations. Is there an order or a policy question I can help you with instead?"
+    _install_mock(monkeypatch, [
+        MockResponse(content=[MockTextBlock(text=refusal)]),
+    ])
+    reply = run_turn("session-4", "recommend a mystery novel")
+    assert reply == refusal
+    assert SESSIONS["session-4"].turn_count == 1
+
+
+def test_run_turn_layer_3_blocks_initiate_return_without_eligibility(monkeypatch):
+    """If the model jumps straight to initiate_return, the tool refuses with
+    eligibility_not_verified, and the model can recover on the next iteration.
+
+    Here we script a model that immediately calls initiate_return, then on the
+    follow-up produces a clean text reply that quotes the error message.
+    """
+    first = MockResponse(
+        stop_reason="tool_use",
+        content=[
+            MockToolUseBlock(
+                id="toolu_1",
+                name="initiate_return",
+                input={
+                    "order_id": "BK-10042",
+                    "customer_email": "sarah.chen@example.com",
+                    "reason": "test",
+                },
+            )
+        ],
+    )
+    second = MockResponse(
+        content=[MockTextBlock(text="I need to check return eligibility first. Could you confirm the email on the order?")],
+    )
+    _install_mock(monkeypatch, [first, second])
+    reply = run_turn("session-5", "return BK-10042")
+    assert "eligibility" in reply.lower() or "email" in reply.lower()
+    # Verify the tool actually refused: nothing should be in returns_initiated.
+    session = SESSIONS["session-5"]
+    assert "BK-10042" not in session.guard_state.returns_initiated
diff --git a/tests/test_tools.py b/tests/test_tools.py
new file mode 100644
index 0000000..2b22ba8
--- /dev/null
+++ b/tests/test_tools.py
@@ -0,0 +1,183 @@
+"""Tool-handler tests covering Layer 3 enforcement and the privacy boundary.
+
+Goal: verify that the tools, on their own, refuse the unsafe operations even
+if the model ignores every system-prompt rule. The model never appears in
+these tests — only the deterministic handlers and the per-session guard state.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import pytest
+
+from mock_data import POLICIES, RETURNS
+from tools import SessionGuardState, dispatch_tool
+
+
+@pytest.fixture(autouse=True)
+def _reset_returns():
+    RETURNS.clear()
+    yield
+    RETURNS.clear()
+
+
+@pytest.fixture
+def state() -> SessionGuardState:
+    return SessionGuardState()
+
+
+def test_lookup_order_returns_order_for_known_id(state):
+    result = dispatch_tool("lookup_order", {"order_id": "BK-10042"}, state)
+    assert "order" in result
+    assert result["order"]["customer_name"] == "Sarah Chen"
+
+
+def test_lookup_order_unknown_id_returns_not_found(state):
+    result = dispatch_tool("lookup_order", {"order_id": "BK-99999"}, state)
+    assert result.get("error") == "order_not_found"
+
+
+def test_lookup_order_email_mismatch_masquerades_as_not_found(state):
+    """Privacy: a wrong email must look identical to a missing order so
+    callers cannot enumerate which IDs exist on the system."""
+    result = dispatch_tool(
+        "lookup_order",
+        {"order_id": "BK-10042", "customer_email": "wrong@example.com"},
+        state,
+    )
+    assert result.get("error") == "order_not_found"
+
+
+def test_lookup_order_email_match_returns_order(state):
+    result = dispatch_tool(
+        "lookup_order",
+        {"order_id": "BK-10042", "customer_email": "Sarah.Chen@example.com"},
+        state,
+    )
+    assert "order" in result
+
+
+def test_eligibility_check_passes_for_recent_delivered_order(state):
+    result = dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-10042", "customer_email": "sarah.chen@example.com"},
+        state,
+    )
+    assert result["eligible"] is True
+    assert "BK-10042" in state.eligibility_checks_passed
+
+
+def test_eligibility_check_rejects_past_window(state):
+    result = dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-9871", "customer_email": "maria.gonzalez@example.com"},
+        state,
+    )
+    assert result["eligible"] is False
+    assert "BK-9871" not in state.eligibility_checks_passed
+    assert "30-day" in result["reason"]
+
+
+def test_eligibility_check_rejects_not_yet_delivered(state):
+    result = dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-10089", "customer_email": "james.murphy@example.com"},
+        state,
+    )
+    assert result["eligible"] is False
+    assert "shipped" in result["reason"]
+
+
+def test_eligibility_check_email_mismatch_returns_auth_failed(state):
+    result = dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-10042", "customer_email": "wrong@example.com"},
+        state,
+    )
+    assert result.get("error") == "auth_failed"
+
+
+def test_initiate_return_refuses_without_prior_eligibility_check(state):
+    """Layer 3 protocol guard: the most important guardrail in the system."""
+    result = dispatch_tool(
+        "initiate_return",
+        {
+            "order_id": "BK-10042",
+            "customer_email": "sarah.chen@example.com",
+            "reason": "Bought by mistake",
+        },
+        state,
+    )
+    assert result.get("error") == "eligibility_not_verified"
+    assert not RETURNS
+
+
+def test_initiate_return_succeeds_after_eligibility_check(state):
+    dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-10042", "customer_email": "sarah.chen@example.com"},
+        state,
+    )
+    result = dispatch_tool(
+        "initiate_return",
+        {
+            "order_id": "BK-10042",
+            "customer_email": "sarah.chen@example.com",
+            "reason": "Bought by mistake",
+        },
+        state,
+    )
+    assert "return_id" in result
+    assert result["return_id"].startswith("RMA-")
+    assert "BK-10042" in state.returns_initiated
+    assert result["return_id"] in RETURNS
+
+
+def test_initiate_return_refuses_duplicate(state):
+    dispatch_tool(
+        "check_return_eligibility",
+        {"order_id": "BK-10042", "customer_email": "sarah.chen@example.com"},
+        state,
+    )
+    dispatch_tool(
+        "initiate_return",
+        {
+            "order_id": "BK-10042",
+            "customer_email": "sarah.chen@example.com",
+            "reason": "Bought by mistake",
+        },
+        state,
+    )
+    second = dispatch_tool(
+        "initiate_return",
+        {
+            "order_id": "BK-10042",
+            "customer_email": "sarah.chen@example.com",
+            "reason": "Bought by mistake",
+        },
+        state,
+    )
+    assert second.get("error") == "already_initiated"
+
+
+def test_lookup_policy_returns_verbatim_text(state):
+    result = dispatch_tool("lookup_policy", {"topic": "password_reset"}, state)
+    assert result["text"] == POLICIES["password_reset"]
+
+
+def test_lookup_policy_unknown_topic_returns_not_supported(state):
+    result = dispatch_tool("lookup_policy", {"topic": "loyalty_program"}, state)
+    assert result.get("error") == "topic_not_supported"
+    assert "shipping" in result["available_topics"]
+
+
+def test_lookup_policy_topic_is_case_insensitive(state):
+    result = dispatch_tool("lookup_policy", {"topic": "SHIPPING"}, state)
+    assert result["text"] == POLICIES["shipping"]
+
+
+def test_dispatch_unknown_tool_returns_error(state):
+    result = dispatch_tool("delete_account", {}, state)
+    assert result.get("error") == "unknown_tool"
diff --git a/tools.py b/tools.py
new file mode 100644
index 0000000..c0bf334
--- /dev/null
+++ b/tools.py
@@ -0,0 +1,295 @@
+"""Tool schemas, dispatch, and Layer 3 (tool-side) guardrail enforcement.
+
+Each tool has an Anthropic-format schema (used in the `tools` argument to
+`messages.create`) and a handler. Handlers are pure functions of (args, state),
+so they are trivial to unit test and the only mutable state lives in
+`SessionGuardState` and the module-level `RETURNS` dict.
+
+The most important guardrail in the whole system lives here:
+`handle_initiate_return` refuses unless `check_return_eligibility` has already
+succeeded for the same order in the same session. This protects against the
+agent skipping the protocol even if the system prompt is ignored entirely.
+"""
+
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import date
+from typing import Any, Callable
+
+from mock_data import ORDERS, POLICIES, RETURN_POLICY, RETURNS, TODAY
+
+
+@dataclass
+class SessionGuardState:
+    """Per-session protocol state used to enforce tool ordering rules.
+
+    Sessions are short-lived chats, so plain in-memory sets are fine. A
+    production deployment would back this with a session store.
+    """
+
+    eligibility_checks_passed: set[str] = field(default_factory=set)
+    returns_initiated: set[str] = field(default_factory=set)
+
+
+# ---------------------------------------------------------------------------
+# Tool schemas (Anthropic format)
+# ---------------------------------------------------------------------------
+
+TOOL_SCHEMAS: list[dict[str, Any]] = [
+    {
+        "name": "lookup_order",
+        "description": (
+            "Look up the status and details of a Bookly order by order ID. "
+            "Optionally pass the customer email to verify ownership before returning details. "
+            "Use this whenever the customer asks about an order."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "order_id": {
+                    "type": "string",
+                    "description": "The order ID, formatted as 'BK-' followed by digits.",
+                },
+                "customer_email": {
+                    "type": "string",
+                    "description": "Optional email used to verify the customer owns the order.",
+                },
+            },
+            "required": ["order_id"],
+        },
+    },
+    {
+        "name": "check_return_eligibility",
+        "description": (
+            "Check whether an order is eligible for return. Requires both order ID and the email "
+            "on the order. Must be called and succeed before initiate_return."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "order_id": {"type": "string"},
+                "customer_email": {"type": "string"},
+            },
+            "required": ["order_id", "customer_email"],
+        },
+    },
+    {
+        "name": "initiate_return",
+        "description": (
+            "Start a return for an order. Only call this after check_return_eligibility has "
+            "succeeded for the same order in this conversation, and after the customer has "
+            "confirmed they want to proceed."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "order_id": {"type": "string"},
+                "customer_email": {"type": "string"},
+                "reason": {
+                    "type": "string",
+                    "description": "The customer's stated reason for the return.",
+                },
+                "item_titles": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Optional list of specific item titles to return. Defaults to all items.",
+                },
+            },
+            "required": ["order_id", "customer_email", "reason"],
+        },
+    },
+    {
+        "name": "lookup_policy",
+        "description": (
+            "Look up a Bookly customer policy by topic. Use this whenever the customer asks "
+            "about shipping, password reset, returns overview, or similar standard policies. "
+            "Returns the verbatim policy text or topic_not_supported."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "topic": {
+                    "type": "string",
+                    "description": "Policy topic, e.g. 'shipping', 'password_reset', 'returns_overview'.",
+                },
+            },
+            "required": ["topic"],
+        },
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _emails_match(a: str | None, b: str | None) -> bool:
+    if a is None or b is None:
+        return False
+    return a.strip().lower() == b.strip().lower()
+
+
+def _is_within_return_window(delivered_date: str | None) -> tuple[bool, int | None]:
+    """Return (within_window, days_since_delivery)."""
+    if delivered_date is None:
+        return (False, None)
+    delivered = date.fromisoformat(delivered_date)
+    days_since = (TODAY - delivered).days
+    return (days_since <= RETURN_POLICY["window_days"], days_since)
+
+
+# ---------------------------------------------------------------------------
+# Handlers
+# ---------------------------------------------------------------------------
+
+
+def handle_lookup_order(args: dict, state: SessionGuardState) -> dict:
+    order_id = args.get("order_id")
+    customer_email = args.get("customer_email")
+    assert isinstance(order_id, str) and order_id, "order_id is required"
+
+    order = ORDERS.get(order_id)
+    if order is None:
+        return {"error": "order_not_found", "message": f"No order found with ID {order_id}."}
+
+    # Privacy: when an email is supplied and does not match, return the same
+    # error as a missing order so callers cannot enumerate which IDs exist.
+    if customer_email is not None and not _emails_match(customer_email, order["email"]):
+        return {"error": "order_not_found", "message": f"No order found with ID {order_id}."}
+
+    return {"order": order}
+
+
+def handle_check_return_eligibility(args: dict, state: SessionGuardState) -> dict:
+    order_id = args.get("order_id")
+    customer_email = args.get("customer_email")
+    assert isinstance(order_id, str) and order_id, "order_id is required"
+    assert isinstance(customer_email, str) and customer_email, "customer_email is required"
+
+    order = ORDERS.get(order_id)
+    if order is None or not _emails_match(customer_email, order["email"]):
+        return {
+            "error": "auth_failed",
+            "message": "Could not verify that order ID and email together. Please double-check both.",
+        }
+
+    if order["status"] != "delivered":
+        return {
+            "eligible": False,
+            "reason": (
+                f"This order has status '{order['status']}', not 'delivered'. "
+                "Returns can only be started after an order has been delivered."
+            ),
+            "policy": RETURN_POLICY,
+        }
+
+    within_window, days_since = _is_within_return_window(order.get("delivered_date"))
+    if not within_window:
+        return {
+            "eligible": False,
+            "reason": (
+                f"This order was delivered {days_since} days ago, which is outside the "
+                f"{RETURN_POLICY['window_days']}-day return window."
+            ),
+            "policy": RETURN_POLICY,
+        }
+
+    state.eligibility_checks_passed.add(order_id)
+    return {
+        "eligible": True,
+        "reason": (
+            f"Order delivered {days_since} days ago, within the "
+            f"{RETURN_POLICY['window_days']}-day window."
+        ),
+        "items": order["items"],
+        "policy": RETURN_POLICY,
+    }
+
+
+def handle_initiate_return(args: dict, state: SessionGuardState) -> dict:
+    order_id = args.get("order_id")
+    customer_email = args.get("customer_email")
+    reason = args.get("reason")
+    item_titles = args.get("item_titles")
+    assert isinstance(order_id, str) and order_id, "order_id is required"
+    assert isinstance(customer_email, str) and customer_email, "customer_email is required"
+    assert isinstance(reason, str) and reason, "reason is required"
+
+    # Layer 3 protocol guard: the agent must have called check_return_eligibility
+    # for this exact order in this session, and it must have passed.
+    if order_id not in state.eligibility_checks_passed:
+        return {
+            "error": "eligibility_not_verified",
+            "message": (
+                "Cannot initiate a return without a successful eligibility check for this "
+                "order in the current session. Call check_return_eligibility first."
+            ),
+        }
+
+    if order_id in state.returns_initiated:
+        return {
+            "error": "already_initiated",
+            "message": "A return has already been initiated for this order in this session.",
+        }
+
+    order = ORDERS.get(order_id)
+    # If the order disappeared between eligibility check and now, fail loudly.
+    if order is None or not _emails_match(customer_email, order["email"]):
+        return {"error": "auth_failed", "message": "Order/email mismatch."}
+
+    titles = item_titles or [item["title"] for item in order["items"]]
+    return_id = f"RMA-{uuid.uuid4().hex[:8].upper()}"
+    record = {
+        "return_id": return_id,
+        "order_id": order_id,
+        "customer_email": order["email"],
+        "items": titles,
+        "reason": reason,
+        "refund_method": RETURN_POLICY["refund_method"],
+        "refund_timeline_days": RETURN_POLICY["refund_timeline_days"],
+        "next_steps": (
+            "We've emailed a prepaid shipping label to the address on file. Drop the package at "
+            "any carrier location within 14 days. Your refund will post within "
+            f"{RETURN_POLICY['refund_timeline_days']} business days of us receiving the return."
+        ),
+    }
+    RETURNS[return_id] = record
+    state.returns_initiated.add(order_id)
+    return record
+
+
+def handle_lookup_policy(args: dict, state: SessionGuardState) -> dict:
+    topic = args.get("topic")
+    assert isinstance(topic, str) and topic, "topic is required"
+
+    text = POLICIES.get(topic.strip().lower())
+    if text is None:
+        return {
+            "error": "topic_not_supported",
+            "message": f"No policy entry for topic '{topic}'.",
+            "available_topics": sorted(POLICIES.keys()),
+        }
+    return {"topic": topic, "text": text}
+
+
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+
+
+_HANDLERS: dict[str, Callable[[dict, SessionGuardState], dict]] = {
+    "lookup_order": handle_lookup_order,
+    "check_return_eligibility": handle_check_return_eligibility,
+    "initiate_return": handle_initiate_return,
+    "lookup_policy": handle_lookup_policy,
+}
+
+
+def dispatch_tool(name: str, args: dict, state: SessionGuardState) -> dict:
+    handler = _HANDLERS.get(name)
+    if handler is None:
+        return {"error": "unknown_tool", "message": f"No tool named {name}."}
+    return handler(args, state)