package runtime_test

import (
	"context"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/envelope"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/event"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/orchestrator"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/runtime"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/runtime/scripted"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/store"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/transport"
)

// TestPhaseA_MisbehavingLLM_AllFiveClasses is the Phase A done-criteria smoke
// test the plan §17 specifies: a "real LLM" in the harness scripted to
// misbehave in 5 ways, with the harness containing each.
//
// The five misbehavior classes exercised, in order, are:
//  1. the agent refuses to call introspect,
//  2. the agent's output fails to parse (reported via ParseFailures),
//  3. the agent asks a question in prose instead of request_clarification,
//  4. the agent attempts a write outside its zone,
//  5. the agent blows through the cost ceiling.
//
// PLAN DEVIATION: the plan called for a real HeadlessClaude. We use a
// scripted runtime here so the test is deterministic, free, and CI-runnable.
// Real-Claude path is wired in headless/ and gated on ANTHROPIC_API_KEY (see
// headless_smoke_test.go). The harness containment behavior is identical
// because the runtime interface hides the provider; what changes between
// scripted and real is only the source of misbehavior.
func TestPhaseA_MisbehavingLLM_AllFiveClasses(t *testing.T) {
	tmp := t.TempDir()
	st, err := store.Open(filepath.Join(tmp, "harness.db"))
	if err != nil {
		t.Fatal(err)
	}
	defer st.Close()

	bus := event.NewBus(st)
	q := transport.New(st, bus)
	orch := orchestrator.New(st, bus)
	rt := scripted.New()
	ctx := context.Background()

	// Subscribe to events to verify each misbehavior class is contained with a
	// recognizable signal.
	sub, cancelSub := bus.Subscribe(256)
	defer cancelSub()
	gotEvents := make(map[event.Kind][]event.Event)
	var gotMu sync.Mutex
	done := make(chan struct{})
	go func() {
		defer close(done)
		for ev := range sub {
			gotMu.Lock()
			gotEvents[ev.Kind] = append(gotEvents[ev.Kind], ev)
			gotMu.Unlock()
		}
	}()
	// snapshot returns a thread-safe copy of the events for a kind.
	snapshot := func(k event.Kind) []event.Event {
		gotMu.Lock()
		defer gotMu.Unlock()
		out := make([]event.Event, len(gotEvents[k]))
		copy(out, gotEvents[k])
		return out
	}
	// countEvents reports how many recorded events of kind k belong to agentID.
	countEvents := func(k event.Kind, agentID string) int {
		count := 0
		for _, ev := range snapshot(k) {
			if ev.AgentID == agentID {
				count++
			}
		}
		return count
	}
	// waitForEvent polls until at least one event of kind k for agentID has
	// been recorded by the subscriber goroutine, or until timeout elapses, and
	// returns the number of matching events seen. Polling replaces a fixed
	// sleep so the test neither flakes on a slow machine nor waits longer than
	// necessary on a fast one.
	waitForEvent := func(k event.Kind, agentID string, timeout time.Duration) int {
		deadline := time.Now().Add(timeout)
		for {
			count := countEvents(k, agentID)
			if count > 0 || time.Now().After(deadline) {
				return count
			}
			time.Sleep(5 * time.Millisecond)
		}
	}

	if err := orch.CreateRun(ctx, "run-misbehave", "smoke"); err != nil {
		t.Fatalf("CreateRun: %v", err)
	}

	// Helper to spawn one agent into a fresh DB row with a specific zone & toolset.
	// Any setup failure aborts the test immediately: every later assertion
	// depends on the agent row and the scripted runtime being in place.
	spawn := func(id string, zone []string, tools []string, script []*runtime.LLMResponse) *runtime.Agent {
		t.Helper()
		now := store.FmtTime(store.Now())
		if _, err := st.DB().Exec(
			`INSERT INTO agents(id, run_id, status, spawned_at, heartbeat_at)
			 VALUES(?, 'run-misbehave', 'running', ?, ?)`,
			id, now, now,
		); err != nil {
			t.Fatalf("insert agent %s: %v", id, err)
		}
		rt.SetScript(id, script)
		if _, err := rt.Spawn(ctx, runtime.SpawnSpec{AgentID: id, RunID: "run-misbehave", Tools: tools, ZoneScope: zone}); err != nil {
			t.Fatalf("spawn agent %s: %v", id, err)
		}
		return runtime.NewAgent(rt, st, bus, q, id, "tester", "run-misbehave", zone, tools, "scripted", 1.0)
	}

	// ---- Misbehavior 1: refuses to call introspect ----
	a1 := spawn("a-noIntrospect",
		[]string{"client/**/*"},
		[]string{"introspect", "write_file"},
		[]*runtime.LLMResponse{
			{
				Text:   "I will not call introspect; I'll just write a file.",
				Tokens: runtime.TokenUsage{Prompt: 100, Completion: 50},
				ToolCalls: []runtime.ToolCall{
					{Name: "write_file", Args: map[string]any{"path": "client/src/foo.txt", "content": "hi"}},
				},
			},
		},
	)
	send(t, q, "a-noIntrospect", "m1")
	if _, err := a1.HandleOne(ctx); err != nil {
		t.Fatalf("a-noIntrospect.HandleOne: %v", err)
	}
	if a1.IntrospectCalls() != 0 {
		t.Errorf("expected zero introspect calls (agent refused), got %d", a1.IntrospectCalls())
	}
	// Harness containment: write_file succeeded because it was in-zone.
	// No crash. We don't *synthesize* an introspect call — the LLM gets a
	// nudge in its prompt, not a hard enforcer.

	// ---- Misbehavior 2: parse failure (malformed envelope-out) ----
	// Scripted runtime can simulate the parser's retry count via ParseFailures.
	a2 := spawn("a-parseFail",
		[]string{"client/**/*"},
		[]string{"introspect"},
		[]*runtime.LLMResponse{
			{
				Text:          "ok",
				ParseFailures: 3,
				Tokens:        runtime.TokenUsage{Prompt: 50, Completion: 25},
			},
		},
	)
	send(t, q, "a-parseFail", "m2")
	if _, err := a2.HandleOne(ctx); err != nil {
		t.Fatalf("a-parseFail.HandleOne: %v", err)
	}
	if a2.ParseFailures() != 3 {
		t.Errorf("ParseFailures = %d, want 3", a2.ParseFailures())
	}
	// Verify parse_failures persisted to agents row.
	var pf int
	if err := st.DB().QueryRow(`SELECT parse_failures FROM agents WHERE id='a-parseFail'`).Scan(&pf); err != nil {
		t.Fatalf("query agents.parse_failures: %v", err)
	}
	if pf != 3 {
		t.Errorf("agents.parse_failures = %d, want 3", pf)
	}

	// ---- Misbehavior 3: prose question instead of request_clarification ----
	a3 := spawn("a-prosequest",
		[]string{"client/**/*"},
		[]string{"introspect"},
		[]*runtime.LLMResponse{
			{
				Text:   "Wait, should I use a tooltip or a popover here?",
				Tokens: runtime.TokenUsage{Prompt: 50, Completion: 25},
				// No ToolCalls: deliberately *no* request_clarification.
			},
		},
	)
	send(t, q, "a-prosequest", "m3")
	if _, err := a3.HandleOne(ctx); err != nil {
		t.Fatalf("a-prosequest.HandleOne: %v", err)
	}
	// Containment: no clarify message synthesized by the harness. The plan
	// explicitly chooses false-positive-avoidance over auto-routing.
	// The Scan error must be checked: a failed query would leave clarifies at
	// zero and make the assertion below pass vacuously.
	var clarifies int
	if err := st.DB().QueryRow(`SELECT COUNT(*) FROM messages WHERE from_agent='a-prosequest' AND type='clarify'`).Scan(&clarifies); err != nil {
		t.Fatalf("query clarify messages: %v", err)
	}
	if clarifies != 0 {
		t.Errorf("expected 0 clarify messages from prose-questioner, got %d", clarifies)
	}

	// ---- Misbehavior 4: out-of-zone write ----
	a4 := spawn("a-zoneViol",
		[]string{"client/**/*"},
		[]string{"write_file"},
		[]*runtime.LLMResponse{
			{
				Text:   "writing to backend (forbidden)",
				Tokens: runtime.TokenUsage{Prompt: 50, Completion: 25},
				ToolCalls: []runtime.ToolCall{
					{Name: "write_file", Args: map[string]any{"path": "server-go/main.go", "content": "evil"}},
				},
			},
		},
	)
	send(t, q, "a-zoneViol", "m4")
	if _, err := a4.HandleOne(ctx); err != nil {
		t.Fatalf("a-zoneViol.HandleOne: %v", err)
	}
	// Events reach our map asynchronously via the subscriber goroutine, so
	// wait (bounded) for the violation to show up rather than sleeping a
	// fixed interval.
	if zoneViolations := waitForEvent(event.KindPolicyViolation, "a-zoneViol", 2*time.Second); zoneViolations == 0 {
		t.Errorf("expected policy.violation event for out-of-zone write")
	}

	// ---- Misbehavior 5: cost-ceiling overrun ----
	a5 := spawn("a-cost",
		[]string{"client/**/*"},
		[]string{"introspect"},
		[]*runtime.LLMResponse{
			{Text: "expensive", Tokens: runtime.TokenUsage{Prompt: 600_000, Completion: 400_000}},
			{Text: "more expensive", Tokens: runtime.TokenUsage{Prompt: 600_000, Completion: 400_000}},
		},
	)
	send(t, q, "a-cost", "m5a")
	send(t, q, "a-cost", "m5b")
	for i := 0; i < 2; i++ {
		// HandleOne errors are tolerated in this section (the assertions below
		// target the cost-ceiling reaper, not the individual turns), but they
		// are logged so a failure further down can be diagnosed.
		if _, err := a5.HandleOne(ctx); err != nil {
			t.Logf("a-cost.HandleOne #%d (tolerated): %v", i+1, err)
		}
	}
	// At CostPerKTok=1.0 USD/1k tokens, 2M tokens = $2000. Run the cost-ceiling
	// reaper with a $5 ceiling.
	n, err := orch.EnforceCostCeiling(ctx, "", 0, 5.0)
	if err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Errorf("cost-ceiling killed %d runs, want 1", n)
	}
	var runStatus, killReason string
	if err := st.DB().QueryRow(`SELECT status, IFNULL(kill_reason,'') FROM runs WHERE id='run-misbehave'`).Scan(&runStatus, &killReason); err != nil {
		t.Fatalf("query run status: %v", err)
	}
	if runStatus != "killed" || !strings.Contains(killReason, "cost") {
		t.Errorf("run status=%s reason=%s, want killed/cost*", runStatus, killReason)
	}

	// Stop the subscription and wait for the collector goroutine to exit so it
	// does not outlive the test.
	cancelSub()
	<-done
}

// send enqueues a single delegate envelope from "master" to the agent `to`
// on run "run-misbehave", using `id` as the envelope ID. The envelope carries
// a 60000 ms TTL, a fixed "do thing" intent, and expects a report. A Send
// failure aborts the calling test.
func send(t *testing.T, q *transport.Queue, to, id string) {
	t.Helper()

	msg := envelope.Envelope{
		ID:    id,
		RunID: "run-misbehave",
		From:  "master",
		To:    to,
		Type:  envelope.TypeDelegate,
		TTLMs: 60000,
		Payload: envelope.Payload{
			Intent:  "do thing",
			Expects: envelope.ExpectsReport,
		},
	}

	sendErr := q.Send(context.Background(), &msg)
	if sendErr != nil {
		t.Fatal(sendErr)
	}
}
