A FastAPI + vanilla JS chat app fronting an Anthropic Claude agent for order status, returns, and policy questions.

Architecture:
- agent.py: system prompt, runtime reminder injection, output validation, agentic tool-use loop with prompt caching on the system prompt block
- tools.py: four tools (lookup_order, check_return_eligibility, initiate_return, lookup_policy) with per-session SessionGuardState enforcing protocol ordering on the tool side
- mock_data.py: orders, return policy, and FAQ entries used as the single source of truth by both the prompt and the tools
- server.py: FastAPI app exposing /api/chat, /health, and the static UI
- static/: vanilla HTML/CSS/JS chat UI, no build step
- tests/: 30 tests covering tool-side enforcement, the privacy boundary, output validation, and the agent loop with a mocked Anthropic client
- deploy/: systemd unit and nginx site config for production
"""Bookly agent: system prompt, guardrails, and the agentic loop.

This module wires four guardrail layers together:

1. The system prompt itself (XML-tagged, primacy+recency duplication, verbatim
   policy block, refusal template, few-shot examples for edge cases).
2. Runtime reminder injection: a short "non-negotiable rules" block appended
   to the system content on every turn, plus a stronger reminder once the
   conversation gets long enough that the original prompt has decayed in
   effective attention.
3. Tool-side enforcement (lives in `tools.py`): handlers refuse unsafe calls
   regardless of what the model decides.
4. Output validation: deterministic regex checks on the final reply for
   ungrounded order IDs/dates, markdown leakage, and off-topic engagement
   without the refusal template. On failure, the bad reply is dropped and the
   user gets a safe canned message — and the bad reply is never appended to
   history, so it cannot poison subsequent turns.

Anthropic prompt caching is enabled on the large system-prompt block so the
per-turn cost stays low across a conversation.
"""
from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass, field
from typing import Any

from anthropic import Anthropic

from config import settings
from tools import SessionGuardState, TOOL_SCHEMAS, dispatch_tool
from mock_data import POLICIES, RETURN_POLICY

logger = logging.getLogger("bookly.agent")
# ---------------------------------------------------------------------------
# System prompt
# ---------------------------------------------------------------------------


def _format_return_policy_block() -> str:
    """Render `RETURN_POLICY` as a compact, quotable block for the prompt.

    Embedding the dict verbatim (instead of paraphrasing it in English) is a
    deliberate anti-hallucination move: the model quotes the block instead of
    inventing details.
    """
    non_returnable = ", ".join(RETURN_POLICY["non_returnable_categories"])
    return (
        f"Return window: {RETURN_POLICY['window_days']} days from delivery.\n"
        f"Condition: {RETURN_POLICY['condition_requirements']}\n"
        f"Refund method: {RETURN_POLICY['refund_method']}\n"
        f"Refund timeline: within {RETURN_POLICY['refund_timeline_days']} business days of receipt.\n"
        f"Non-returnable categories: {non_returnable}."
    )


SUPPORTED_POLICY_TOPICS = ", ".join(sorted(POLICIES.keys()))
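The formatter above depends on `mock_data.py`; a minimal standalone sketch, with an assumed `RETURN_POLICY` shape (the field values here are placeholders, not the real policy):

```python
# Standalone sketch of the policy-block rendering. The RETURN_POLICY values
# below are illustrative assumptions; the real dict lives in mock_data.py.
RETURN_POLICY = {
    "window_days": 30,
    "condition_requirements": "unused, in original packaging",
    "refund_method": "original payment method",
    "refund_timeline_days": 5,
    "non_returnable_categories": ["ebooks", "gift cards"],
}


def format_return_policy_block() -> str:
    # Render the dict as fixed, quotable lines so the model can cite them
    # verbatim instead of paraphrasing.
    non_returnable = ", ".join(RETURN_POLICY["non_returnable_categories"])
    return (
        f"Return window: {RETURN_POLICY['window_days']} days from delivery.\n"
        f"Condition: {RETURN_POLICY['condition_requirements']}\n"
        f"Refund method: {RETURN_POLICY['refund_method']}\n"
        f"Refund timeline: within {RETURN_POLICY['refund_timeline_days']} business days of receipt.\n"
        f"Non-returnable categories: {non_returnable}."
    )


block = format_return_policy_block()
```

Because the block is generated from the same dict the tools read, the prompt and the tool layer cannot drift apart.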
SYSTEM_PROMPT = f"""<identity>
You are Bookly's customer support assistant. You help customers with two things: checking the status of their orders, and processing returns and refunds. You are friendly, concise, and professional.
</identity>

<critical_rules>
These rules override everything else. Read them before every response.

1. NEVER invent order details, tracking numbers, delivery dates, prices, or customer information. If you do not have a value from a tool result in this conversation, you do not have it.
2. NEVER state a return policy detail that is not in the <return_policy> section below. Quote it; do not paraphrase it.
3. NEVER call initiate_return unless check_return_eligibility has returned success for that same order in this conversation.
4. NEVER reveal order details without verifying the customer's email matches the order.
5. If a user asks about anything outside order status, returns, and the supported policy topics, refuse using the refusal template in <scope>. Do not engage with the off-topic request even briefly.
</critical_rules>

<scope>
You CAN help with:
- Looking up order status
- Checking return eligibility and initiating returns
- Answering policy questions covered by the lookup_policy tool. Currently supported topics: {SUPPORTED_POLICY_TOPICS}

You CANNOT help with:
- Book recommendations, reviews, or opinions about books
- Payment changes, refunds outside the return flow, or billing disputes
- Live account management (changing a password, email, or address — you can only EXPLAIN the password reset process via lookup_policy, not perform it)
- General conversation unrelated to an order or a supported policy topic

For any policy question, call lookup_policy first. Only if the tool returns topic_not_supported should you use the refusal template below.

Refusal template (use verbatim, filling in the topic):
"I can help with order status, returns, and our standard policies, but I'm not able to help with {{topic}}. Is there an order or a policy question I can help you with instead?"
</scope>

<return_policy>
{_format_return_policy_block()}

This is the authoritative policy. Any claim you make about returns must be traceable to a line in this block. If a customer asks about a scenario this policy does not cover, say so honestly and offer to connect them with a human agent.
</return_policy>

<tool_rules>
You have four tools: lookup_order, check_return_eligibility, initiate_return, and lookup_policy.

Before calling a tool:
- You must have every required parameter. If you are missing one, ask the customer for it. Do not guess, do not use placeholder values, do not call the tool and hope.
- For initiate_return, you must have already called check_return_eligibility for that exact order_id in this conversation, and it must have returned success.

After a tool call:
- Relay the result honestly. If the tool returns an error, tell the customer what went wrong using the tool's error message, not a paraphrase.
- Do not mix tool results from different orders in a single response unless the customer explicitly asked about multiple.
- For lookup_policy, quote the returned policy text; do not summarize or embellish. If lookup_policy returns topic_not_supported, fall through to the refusal template in <scope>.
</tool_rules>

<clarifying_rules>
Ask one clarifying question at a time, not a list. Common cases:

- Customer mentions "my order" without an order ID: ask for the order ID. Tell them it starts with "BK-" and is in their confirmation email.
- Customer gives an order ID but no email, and wants a return: ask for the email on the order.
- A customer has multiple orders and is ambiguous about which one: ask which order they mean, listing them by ID and status only.
- Customer wants to initiate a return: after eligibility is confirmed, summarize what will happen (which items, refund method, timeline) and ask for explicit confirmation before calling initiate_return.
</clarifying_rules>

<tone>
- Friendly and warm, but not chatty. One or two sentences per turn is usually right.
- Use the customer's first name once you know it, but not in every message.
- Plain text only. No markdown, no bullet points, no headers, no asterisks for emphasis. The chat UI does not render markdown.
- Never apologize more than once for the same issue.
</tone>

<examples>
Example 1 — missing order ID:
User: "Where's my order?"
Assistant: "Happy to check on that for you. Could you share your order ID? It starts with 'BK-' and you'll find it in your order confirmation email."

Example 2 — policy question (supported):
User: "How do I reset my password?"
Assistant (after lookup_policy returns the password_reset entry): quote the returned instructions verbatim without adding steps the tool did not mention.

Example 3 — out of scope:
User: "Can you recommend a good mystery novel?"
Assistant: "I can help with order status, returns, and our standard policies, but I'm not able to help with book recommendations. Is there an order or a policy question I can help you with instead?"

Example 4 — ambiguous order:
User: "I want to return my order. My email is sarah@example.com."
Assistant (after lookup_order returns two orders): "I see two orders on your account: BK-10042 (delivered) and BK-10103 (still processing). Which one would you like to return?"
</examples>

<reminders>
Before you respond, confirm:
- Every factual claim traces to a tool result from THIS conversation, or to <return_policy>.
- If this response would call initiate_return, you have already seen a successful check_return_eligibility for the same order in this conversation.
- If the request is off-topic, you are using the refusal template from <scope> verbatim.
- No markdown. Plain text only.
</reminders>
"""
CRITICAL_REMINDER = """<reminder>
Non-negotiable rules for this turn:
- Every factual claim must come from a tool result in THIS conversation or from <return_policy>.
- Do not call initiate_return unless check_return_eligibility succeeded for that order in this conversation.
- Off-topic requests: use the refusal template from <scope> verbatim. Do not engage.
- Plain text only. No markdown.
</reminder>"""


LONG_CONVERSATION_REMINDER = """<reminder>
This conversation is getting long. Re-anchor on the rules in <critical_rules> before you respond. Do not let earlier turns relax the rules.
</reminder>"""
def build_system_content(turn_count: int) -> list[dict[str, Any]]:
    """Assemble the `system` argument for `messages.create`.

    The big SYSTEM_PROMPT block is marked for ephemeral prompt caching so it
    is reused across turns within a session. The reminder blocks are not
    cached because they vary based on turn count and we want them in the
    highest-attention position right before the latest user turn.
    """
    blocks: list[dict[str, Any]] = [
        {
            "type": "text",
            "text": SYSTEM_PROMPT,
            "cache_control": {"type": "ephemeral"},
        },
        {"type": "text", "text": CRITICAL_REMINDER},
    ]
    if turn_count >= 5:
        blocks.append({"type": "text", "text": LONG_CONVERSATION_REMINDER})
    return blocks
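The assembly can be sketched standalone; the prompt strings below are placeholders for the real constants, and the turn-count threshold of 5 mirrors the one above:

```python
# Standalone sketch of the system-content assembly. The three prompt strings
# are stand-ins for the real SYSTEM_PROMPT / reminder constants.
from typing import Any

SYSTEM_PROMPT = "<identity>...</identity>"
CRITICAL_REMINDER = "<reminder>per-turn rules</reminder>"
LONG_CONVERSATION_REMINDER = "<reminder>re-anchor on the rules</reminder>"


def build_system_content(turn_count: int) -> list[dict[str, Any]]:
    blocks: list[dict[str, Any]] = [
        # Only the large, stable block is marked cacheable; the reminders
        # stay uncached so they sit last, closest to the newest user turn.
        {"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}},
        {"type": "text", "text": CRITICAL_REMINDER},
    ]
    if turn_count >= 5:
        blocks.append({"type": "text", "text": LONG_CONVERSATION_REMINDER})
    return blocks


early = build_system_content(0)  # 2 blocks
late = build_system_content(7)   # 3 blocks: long-conversation reminder added
```

The design choice here is ordering: cached bulk first, volatile reminders last, so cache reuse and recency-position both hold on every turn.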
# ---------------------------------------------------------------------------
# Layer 4 — output validation
# ---------------------------------------------------------------------------


ORDER_ID_RE = re.compile(r"\bBK-\d{4,6}\b")
DATE_ISO_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
MARKDOWN_RE = re.compile(r"(\*\*|__|^#{1,6}\s|^\s*[-*+]\s)", re.MULTILINE)

# Heuristic keywords that tend to appear when the agent is engaging with an
# off-topic request. Engagement is only flagged if the refusal template is
# absent — quoting the template itself is fine.
OUT_OF_SCOPE_KEYWORDS = {
    "recommend",
    "recommendation",
    "i suggest",
    "you should read",
    "what should i read",
    "review of",
    "great book",
    "favorite book",
}

REFUSAL_PHRASE = "i'm not able to help with"


@dataclass
class ValidationResult:
    ok: bool
    violations: list[str] = field(default_factory=list)


def _collect_grounded_values(tool_results: list[dict], pattern: re.Pattern[str]) -> set[str]:
    """Pull every substring matching `pattern` out of the tool result JSON."""
    grounded: set[str] = set()
    for entry in tool_results:
        text = json.dumps(entry.get("result", {}))
        grounded.update(pattern.findall(text))
    return grounded


def validate_reply(reply: str, tool_results_this_turn: list[dict]) -> ValidationResult:
    """Run deterministic checks on the final assistant reply.

    Heuristic, not exhaustive. Catches the cheap wins — fabricated order IDs,
    made-up dates, markdown leakage, and obvious off-topic engagement. For
    anything subtler we rely on layers 1–3.
    """
    assert isinstance(reply, str), "reply must be a string"
    assert isinstance(tool_results_this_turn, list), "tool_results_this_turn must be a list"

    violations: list[str] = []

    grounded_ids = _collect_grounded_values(tool_results_this_turn, ORDER_ID_RE)
    for match in ORDER_ID_RE.findall(reply):
        if match not in grounded_ids:
            violations.append(f"ungrounded_order_id:{match}")

    grounded_dates = _collect_grounded_values(tool_results_this_turn, DATE_ISO_RE)
    for match in DATE_ISO_RE.findall(reply):
        if match not in grounded_dates:
            violations.append(f"ungrounded_date:{match}")

    if MARKDOWN_RE.search(reply):
        violations.append("markdown_leaked")

    lowered = reply.lower()
    engaged_off_topic = any(kw in lowered for kw in OUT_OF_SCOPE_KEYWORDS)
    if engaged_off_topic and REFUSAL_PHRASE not in lowered:
        violations.append("off_topic_engagement")

    return ValidationResult(ok=not violations, violations=violations)
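The grounding check is the core of this layer and can be exercised in isolation; a minimal sketch of just the order-ID path, with made-up order IDs for the demo data:

```python
# Standalone sketch of the "grounded order ID" check: an ID in the reply is
# legitimate only if the same ID appears in this turn's serialized tool
# results. The example order IDs are invented for illustration.
import json
import re

ORDER_ID_RE = re.compile(r"\bBK-\d{4,6}\b")


def ungrounded_order_ids(reply: str, tool_results: list[dict]) -> list[str]:
    grounded: set[str] = set()
    for entry in tool_results:
        # Serialize the whole result so IDs are found at any nesting depth.
        grounded.update(ORDER_ID_RE.findall(json.dumps(entry.get("result", {}))))
    return [m for m in ORDER_ID_RE.findall(reply) if m not in grounded]


results = [{"name": "lookup_order", "result": {"order_id": "BK-10042", "status": "delivered"}}]
ok = ungrounded_order_ids("Your order BK-10042 was delivered.", results)
bad = ungrounded_order_ids("Order BK-99999 ships tomorrow.", results)
```

Note the deliberate asymmetry: a fabricated ID fails loudly, while a reply with no IDs at all passes, which is why this layer complements rather than replaces the prompt-side rules.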
# ---------------------------------------------------------------------------
# Session and agent loop
# ---------------------------------------------------------------------------


SAFE_FALLBACK = (
    "I hit a problem generating a response. Could you rephrase your question, "
    "or share an order ID so I can try again?"
)


@dataclass
class Session:
    history: list[dict[str, Any]] = field(default_factory=list)
    guard_state: SessionGuardState = field(default_factory=SessionGuardState)
    turn_count: int = 0


# Global session store keyed by session_id. The server module owns the
# lifetime of these — agent.py only reads/writes them through `run_turn`.
SESSIONS: dict[str, Session] = {}


def get_or_create_session(session_id: str) -> Session:
    assert isinstance(session_id, str) and session_id, "session_id is required"
    session = SESSIONS.get(session_id)
    if session is None:
        session = Session()
        SESSIONS[session_id] = session
    return session


# Lazily initialized so unit tests can monkeypatch _client without tripping
# the missing-env-var failure path.
_client: Anthropic | None = None


def _get_client() -> Anthropic:
    global _client
    if _client is None:
        _client = Anthropic(api_key=settings.anthropic_api_key)
    return _client
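The lazy-singleton pattern is what makes the client test-friendly; a minimal sketch with a stub in place of the real `Anthropic(...)` constructor:

```python
# Sketch of the lazy-singleton client pattern. Nothing touches credentials
# until the first real call, so tests can replace _client before that point.
_client = None


def _get_client():
    global _client
    if _client is None:
        # The real module constructs Anthropic(api_key=...) here; a plain
        # object stands in so this sketch runs without credentials.
        _client = object()
    return _client


a = _get_client()
b = _get_client()  # same instance: constructed exactly once
```

A test can assign to the module-level `_client` before any turn runs and the constructor is never reached.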
def _extract_text(content_blocks: list[Any]) -> str:
    parts: list[str] = []
    for block in content_blocks:
        if getattr(block, "type", None) == "text":
            parts.append(getattr(block, "text", "") or "")
    return "".join(parts).strip()


def _serialize_assistant_content(content_blocks: list[Any]) -> list[dict]:
    """Convert SDK content blocks back into JSON-serializable dicts for history."""
    serialized: list[dict] = []
    for block in content_blocks:
        block_type = getattr(block, "type", None)
        if block_type == "text":
            serialized.append({"type": "text", "text": getattr(block, "text", "") or ""})
        elif block_type == "tool_use":
            serialized.append(
                {
                    "type": "tool_use",
                    "id": getattr(block, "id", None),
                    "name": getattr(block, "name", None),
                    "input": getattr(block, "input", None),
                }
            )
    return serialized
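The SDK hands back attribute-style content blocks; `types.SimpleNamespace` is a convenient stand-in for sketching how text extraction skips interleaved tool_use blocks (block contents here are invented for the demo):

```python
# Sketch of text extraction over mixed content blocks, using SimpleNamespace
# as a stand-in for SDK block objects.
from types import SimpleNamespace
from typing import Any


def extract_text(content_blocks: list[Any]) -> str:
    # Concatenate only the text blocks; tool_use blocks carry no reply text.
    parts = [
        getattr(b, "text", "") or ""
        for b in content_blocks
        if getattr(b, "type", None) == "text"
    ]
    return "".join(parts).strip()


blocks = [
    SimpleNamespace(type="text", text="Checking that now. "),
    SimpleNamespace(type="tool_use", id="tu_1", name="lookup_order", input={"order_id": "BK-10042"}),
    SimpleNamespace(type="text", text="One moment."),
]
text = extract_text(blocks)
```

Using `getattr` with defaults rather than attribute access keeps both helpers robust to mock objects in tests and to future block types in the SDK.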
def run_turn(session_id: str, user_message: str) -> str:
    """Run one user turn end-to-end and return the assistant's reply text.

    Wires together: history append, system content with reminders, the
    tool-use loop, output validation, and the safe-fallback path on
    validation failure.
    """
    assert isinstance(user_message, str) and user_message.strip(), "user_message is required"

    session = get_or_create_session(session_id)
    session.history.append({"role": "user", "content": user_message})

    system_content = build_system_content(session.turn_count)
    client = _get_client()

    tool_results_this_turn: list[dict] = []

    def _call_model() -> Any:
        return client.messages.create(
            model=settings.anthropic_model,
            max_tokens=settings.max_tokens,
            system=system_content,
            tools=TOOL_SCHEMAS,
            messages=session.history,
        )

    response = _call_model()

    # Tool-use loop: keep dispatching tools until the model returns end_turn.
    while getattr(response, "stop_reason", None) == "tool_use":
        assistant_blocks = _serialize_assistant_content(response.content)
        session.history.append({"role": "assistant", "content": assistant_blocks})

        tool_result_blocks: list[dict] = []
        for block in response.content:
            if getattr(block, "type", None) != "tool_use":
                continue
            name = block.name
            args = getattr(block, "input", None) or {}
            tool_id = block.id
            result = dispatch_tool(name, args, session.guard_state)
            tool_results_this_turn.append({"name": name, "result": result})
            tool_result_blocks.append(
                {
                    "type": "tool_result",
                    "tool_use_id": tool_id,
                    "content": json.dumps(result),
                }
            )

        session.history.append({"role": "user", "content": tool_result_blocks})
        response = _call_model()

    reply_text = _extract_text(response.content)
    validation = validate_reply(reply_text, tool_results_this_turn)
    if not validation.ok:
        logger.warning(
            "validation_failed session=%s turn=%s violations=%s reply=%r",
            session_id,
            session.turn_count,
            validation.violations,
            reply_text,
        )
        # Do NOT append the bad reply to history — that would poison future turns.
        session.turn_count += 1
        return SAFE_FALLBACK

    session.history.append(
        {"role": "assistant", "content": _serialize_assistant_content(response.content)}
    )
    session.turn_count += 1
    return reply_text
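The tool-use loop can be exercised without the network, the same way the test suite does with a mocked Anthropic client; a minimal sketch with scripted responses (all names and values here are stand-ins, not the real SDK):

```python
# Sketch of the tool-use loop against a scripted fake client. SimpleNamespace
# objects stand in for SDK responses; the order ID and date are invented.
from types import SimpleNamespace

scripted = [
    # First model call: the model asks for a tool.
    SimpleNamespace(
        stop_reason="tool_use",
        content=[SimpleNamespace(type="tool_use", id="tu_1",
                                 name="lookup_order", input={"order_id": "BK-10042"})],
    ),
    # Second model call, after the tool result: the model ends the turn.
    SimpleNamespace(
        stop_reason="end_turn",
        content=[SimpleNamespace(type="text", text="BK-10042 was delivered on 2024-05-02.")],
    ),
]
calls = iter(scripted)
dispatched: list[str] = []


def fake_dispatch(name: str, args: dict) -> dict:
    dispatched.append(name)
    return {"order_id": args["order_id"], "status": "delivered"}


response = next(calls)
while response.stop_reason == "tool_use":
    for block in response.content:
        if block.type == "tool_use":
            fake_dispatch(block.name, block.input)
    response = next(calls)

reply = "".join(b.text for b in response.content if b.type == "text")
```

This mirrors the control flow of `run_turn`: loop while `stop_reason == "tool_use"`, dispatch each tool block, feed results back, and only then read the final text.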