A FastAPI + vanilla JS chat app fronting an Anthropic Claude agent for order status, returns, and policy questions. Architecture: - agent.py: system prompt, runtime reminder injection, output validation, agentic tool-use loop with prompt caching on the system prompt block - tools.py: four tools (lookup_order, check_return_eligibility, initiate_return, lookup_policy) with per-session SessionGuardState enforcing protocol ordering on the tool side - mock_data.py: orders, return policy, and FAQ entries used as the single source of truth by both the prompt and the tools - server.py: FastAPI app exposing /api/chat, /health, and the static UI - static/: vanilla HTML/CSS/JS chat UI, no build step - tests/: 30 tests covering tool-side enforcement, the privacy boundary, output validation, and the agent loop with a mocked Anthropic client - deploy/: systemd unit and nginx site config for production
263 lines
8.9 KiB
Python
"""Agent-layer tests: validate_reply (Layer 4) and run_turn end-to-end with a
mocked Anthropic client.

The Anthropic API is never called. Each test wires a fake `_client` onto the
agent module that produces canned response objects, so the tests assert how
the agent loop wires layers 3 and 4 together rather than what the model
actually generates.
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
|
|
# Provide a dummy API key so `from agent import ...` does not fail when
|
|
# pydantic-settings reads .env.
|
|
os.environ.setdefault("ANTHROPIC_API_KEY", "test-key-not-used")
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
import agent
|
|
from agent import SAFE_FALLBACK, SESSIONS, build_system_content, run_turn, validate_reply
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Mock SDK objects
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class MockTextBlock:
    """Stand-in for the SDK's text content block (`type == "text"`)."""

    # The assistant-visible reply text.
    text: str
    type: str = "text"
|
|
|
|
|
|
@dataclass
class MockToolUseBlock:
    """Stand-in for the SDK's tool_use content block (`type == "tool_use"`)."""

    # Tool-use id echoed back in the matching tool_result message.
    id: str
    # Name of the tool the scripted model "calls".
    name: str
    # Arguments the scripted model passes to the tool.
    input: dict
    type: str = "tool_use"
|
|
|
|
|
|
@dataclass
class MockResponse:
    """Stand-in for one `messages.create` response from the Anthropic SDK."""

    # Mixed list of MockTextBlock / MockToolUseBlock content blocks.
    content: list[Any]
    # "tool_use" keeps the agent loop iterating; "end_turn" ends it.
    stop_reason: str = "end_turn"
|
|
|
|
|
|
class MockClient:
    """A scripted Anthropic client.

    Hands out the next response in `script` each time `messages.create` is
    called, and records every call's kwargs for later inspection.
    """

    def __init__(self, script: "list[MockResponse]"):
        # Copy so popping responses never mutates the caller's list.
        self.script = list(script)
        # One kwargs dict per messages.create call, in call order.
        self.calls: list[dict] = []

        def _create(**kwargs):
            self.calls.append(kwargs)
            if not self.script:
                raise AssertionError("MockClient ran out of scripted responses")
            return self.script.pop(0)

        # Expose the SDK-shaped `client.messages.create(...)` surface.
        class _Messages:
            create = staticmethod(_create)

        self.messages = _Messages()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _reset_sessions_and_client(monkeypatch):
    """Give every test a clean session store and no cached SDK client."""
    SESSIONS.clear()
    # Null the cached client so each test installs its own mock.
    monkeypatch.setattr(agent, "_client", None)
    try:
        yield
    finally:
        SESSIONS.clear()
|
|
|
|
|
|
def _install_mock(monkeypatch, script: "list[MockResponse]") -> MockClient:
    """Patch agent._get_client so run_turn talks to a scripted MockClient."""
    mock = MockClient(script)
    monkeypatch.setattr(agent, "_get_client", lambda: mock)
    return mock
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# build_system_content
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_build_system_content_caches_main_prompt_block():
    """The main system prompt block carries ephemeral cache_control."""
    first, reminder = build_system_content(turn_count=0)[:2]
    assert first["cache_control"] == {"type": "ephemeral"}
    # The runtime reminder block follows and must never be cached.
    assert reminder["text"].startswith("<reminder>")
    assert "cache_control" not in reminder
|
|
|
|
|
|
def test_build_system_content_adds_long_conversation_reminder_after_threshold():
    """Past the turn threshold, exactly one extra reminder block appears."""
    below = build_system_content(turn_count=2)
    above = build_system_content(turn_count=5)
    assert len(above) == len(below) + 1
    # The appended block mentions the long-conversation condition.
    assert "long" in above[-1]["text"].lower()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# validate_reply
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_validate_reply_passes_clean_reply():
    """An order ID grounded in a tool result passes with no violations."""
    tool_results = [
        {"name": "lookup_order", "result": {"order": {"order_id": "BK-10042"}}},
    ]
    verdict = validate_reply("Your order BK-10042 was delivered.", tool_results)
    assert verdict.ok
    assert verdict.violations == []
|
|
|
|
|
|
def test_validate_reply_flags_ungrounded_order_id():
    """An order ID no tool ever returned must be flagged as ungrounded."""
    verdict = validate_reply("Your order BK-99999 is on the way.", [])
    assert not verdict.ok
    assert "ungrounded_order_id:BK-99999" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_ungrounded_date():
    """A date that appeared in no tool result must be flagged."""
    verdict = validate_reply("It will arrive on 2026-12-25.", [])
    assert not verdict.ok
    flagged = [v for v in verdict.violations if v.startswith("ungrounded_date:")]
    assert flagged
|
|
|
|
|
|
def test_validate_reply_passes_grounded_date():
    """A date taken verbatim from a tool result is allowed through."""
    tool_results = [
        {"name": "lookup_order", "result": {"order": {"delivered_date": "2026-04-01"}}},
    ]
    assert validate_reply("It was delivered on 2026-04-01.", tool_results).ok
|
|
|
|
|
|
def test_validate_reply_flags_markdown_bold():
    """Bold `**` markup must trip the markdown_leaked violation."""
    verdict = validate_reply("Here are your **details**.", [])
    assert not verdict.ok
    assert "markdown_leaked" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_markdown_bullet():
    """A `- ` bullet list must trip the markdown_leaked violation."""
    verdict = validate_reply("Items:\n- The Goldfinch\n- Sapiens", [])
    assert not verdict.ok
    assert "markdown_leaked" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_flags_off_topic_engagement():
    """Engaging with a book recommendation must be flagged off-topic."""
    reply = "I recommend Project Hail Mary, it's a great book."
    verdict = validate_reply(reply, [])
    assert not verdict.ok
    assert "off_topic_engagement" in verdict.violations
|
|
|
|
|
|
def test_validate_reply_allows_refusal_template_even_with_keywords():
    """The canonical refusal wording passes even though it mentions books."""
    refusal = (
        "I can help with order status, returns, and our standard policies, "
        "but I'm not able to help with book recommendations. Is there an "
        "order or a policy question I can help you with instead?"
    )
    assert validate_reply(refusal, []).ok
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# run_turn end-to-end with mocked client
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_run_turn_returns_simple_text_reply(monkeypatch):
    """A plain text response is returned and recorded in session history."""
    script = [
        MockResponse(content=[MockTextBlock(text="Hi! How can I help with an order today?")]),
    ]
    _install_mock(monkeypatch, script)

    reply = run_turn("session-1", "hi there")

    assert "How can I help" in reply
    state = SESSIONS["session-1"]
    assert state.turn_count == 1
    # The final history entry is the assistant's reply.
    assert state.history[-1]["role"] == "assistant"
|
|
|
|
|
|
def test_run_turn_with_tool_use_loop(monkeypatch):
    """Two-step loop: model asks for a tool, then produces a final reply."""
    tool_call = MockResponse(
        stop_reason="tool_use",
        content=[
            MockToolUseBlock(
                id="toolu_1",
                name="lookup_order",
                input={"order_id": "BK-10042"},
            )
        ],
    )
    final = MockResponse(
        content=[MockTextBlock(text="Your order BK-10042 was delivered.")],
    )
    client = _install_mock(monkeypatch, [tool_call, final])

    reply = run_turn("session-2", "Where is BK-10042?")

    assert "BK-10042" in reply
    # One API call for the tool request, one for the final text.
    assert len(client.calls) == 2
    # Transcript: user, assistant(tool_use), user(tool_result), assistant(text).
    roles = [message["role"] for message in SESSIONS["session-2"].history]
    assert roles[:4] == ["user", "assistant", "user", "assistant"]
|
|
|
|
|
|
def test_run_turn_drops_hallucinated_reply_and_returns_safe_fallback(monkeypatch):
    """A reply that mentions an order ID never seen by a tool must trigger
    SAFE_FALLBACK, and the bad reply must not be appended to history."""
    hallucinated = MockResponse(
        content=[MockTextBlock(text="Your order BK-99999 will arrive on 2026-12-25.")],
    )
    _install_mock(monkeypatch, [hallucinated])

    reply = run_turn("session-3", "where is my order")

    assert reply == SAFE_FALLBACK
    # The hallucinated assistant turn is dropped; only the user message remains.
    assert [m["role"] for m in SESSIONS["session-3"].history] == ["user"]
|
|
|
|
|
|
def test_run_turn_passes_through_refusal_template(monkeypatch):
    """A valid refusal reply is returned untouched and still counts as a turn."""
    refusal = (
        "I can help with order status, returns, and our standard policies, "
        "but I'm not able to help with book recommendations. Is there an "
        "order or a policy question I can help you with instead?"
    )
    _install_mock(monkeypatch, [
        MockResponse(content=[MockTextBlock(text=refusal)]),
    ])

    assert run_turn("session-4", "recommend a mystery novel") == refusal
    assert SESSIONS["session-4"].turn_count == 1
|
|
|
|
|
|
def test_run_turn_layer_3_blocks_initiate_return_without_eligibility(monkeypatch):
    """If the model jumps straight to initiate_return, the tool refuses with
    eligibility_not_verified, and the model can recover on the next iteration.

    Here we script a model that immediately calls initiate_return, then on the
    follow-up produces a clean text reply that quotes the error message.
    """
    premature = MockResponse(
        stop_reason="tool_use",
        content=[
            MockToolUseBlock(
                id="toolu_1",
                name="initiate_return",
                input={
                    "order_id": "BK-10042",
                    "customer_email": "sarah.chen@example.com",
                    "reason": "test",
                },
            )
        ],
    )
    recovery = MockResponse(
        content=[MockTextBlock(text="I need to check return eligibility first. Could you confirm the email on the order?")],
    )
    _install_mock(monkeypatch, [premature, recovery])

    reply = run_turn("session-5", "return BK-10042")

    lowered = reply.lower()
    assert "eligibility" in lowered or "email" in lowered
    # Verify the tool actually refused: nothing should be in returns_initiated.
    assert "BK-10042" not in SESSIONS["session-5"].guard_state.returns_initiated
|