package orchestrator

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/event"
	"github.com/flothus/tmux-xterm-research/server-go/internal/configsvc"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/org"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/runtime"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/store"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/transport"
)

// spawnContextJSON renders m as two-space-indented JSON for the
// spawn-context artifact. Encoding is best-effort: a marshalling error is
// dropped and the (empty) result is returned instead.
func spawnContextJSON(m map[string]any) []byte {
	encoded, err := json.MarshalIndent(m, "", "  ")
	if err != nil {
		// Best-effort artifact: the caller has no error channel.
		return nil
	}
	return encoded
}

// Thin local aliases for the stdlib calls used by resolveRolePath; they keep
// that function compact.
var (
	osStat       = os.Stat
	filepathDir  = filepath.Dir
	filepathJoin = filepath.Join
)

// OrgRunner walks an org.Definition and turns it into live agents bound to a
// run. Plan §17 Phase N.
//
// Provider routing: each role's `provider` string picks a Runtime impl from
// the Runtimes map. Common providers we recognize:
//   - "scripted"     — internal/harness/runtime/scripted (tests)
//   - "claude-code"  — internal/harness/runtime/tmux + claudecode adapter
//   - "anthropic"    — internal/harness/runtime/headless
//   - "codex" / "gemini" — future tmux adapters
//
// A provider with no entry in Runtimes is an error: pickRuntime refuses to
// fall back to a "default" runtime and reports the available providers.
//
// An OrgRunner holds a sync.Mutex and must not be copied after first use;
// construct it with NewOrgRunner so the internal maps are initialised.
type OrgRunner struct {
	Orch     *Orchestrator
	Queue    *transport.Queue
	Runtimes map[string]runtime.Runtime
	Def      *org.Definition

	// ProjectRoot anchors agent file writes. Passed to every Agent so
	// relative paths emitted by tools resolve to the same place regardless
	// of the calling process's CWD.
	ProjectRoot string

	// CostPerKTokDefault is the default cost factor for cost-accounting roles
	// (mostly headless/API). Subscription roles ignore this.
	CostPerKTokDefault float64

	// ConfigSvc is the org/role builder façade. When non-nil, every
	// spawned Agent gets it installed so org.* / role.* tool calls work.
	// OC4: same Service as the HTTP /api/config endpoints.
	ConfigSvc *configsvc.Service

	// mu guards agents and inflight.
	mu     sync.Mutex
	agents map[string]*Spawned // roleID → spawned record (ID + live Agent)
	// V79: per-role inflight markers so two concurrent SpawnRole calls
	// for the same role serialise; the second caller waits for the first
	// instead of starting a duplicate spawn pipeline. The channel is closed
	// by clearInflight when the owning spawn attempt finishes.
	inflight map[string]chan struct{}
}

// NewOrgRunner builds an OrgRunner wired to the given orchestrator, queue,
// runtime table and org definition. The per-role bookkeeping maps are
// initialised empty and CostPerKTokDefault starts at 0.005; ProjectRoot and
// ConfigSvc are left at their zero values for the caller to set.
func NewOrgRunner(orch *Orchestrator, q *transport.Queue, runtimes map[string]runtime.Runtime, def *org.Definition) *OrgRunner {
	runner := &OrgRunner{
		Orch:               orch,
		Queue:              q,
		Runtimes:           runtimes,
		Def:                def,
		CostPerKTokDefault: 0.005,
	}
	runner.agents = make(map[string]*Spawned)
	runner.inflight = make(map[string]chan struct{})
	return runner
}

// SpawnAll eagerly spawns every role declared in the org definition and binds
// the resulting agents to runID. It exists for tests and bootstrap paths;
// production runs should prefer the lazy SpawnRole so roles that never
// receive work are never created.
//
// Roles are spawned parents-first (see topoSortRoles, V89) so that each
// child's parent_agent_id lookup can find an already-spawned parent.
//
// It returns one Spawned record per role in spawn order. On the first failure
// it returns the records collected so far together with a wrapped error.
func (r *OrgRunner) SpawnAll(ctx context.Context, runID string) ([]Spawned, error) {
	if r.Def == nil {
		return nil, errors.New("orgrunner: no org definition")
	}

	var spawned []Spawned
	for _, role := range topoSortRoles(r.Def.Roles) {
		agent, err := r.SpawnRole(ctx, runID, role.ID)
		if err != nil {
			return spawned, fmt.Errorf("orgrunner: role %s: %w", role.ID, err)
		}

		// SpawnRole caches a *Spawned under the role ID; prefer that record
		// and only synthesise one when the cache has no entry.
		r.mu.Lock()
		cached := r.agents[role.ID]
		r.mu.Unlock()

		if cached != nil {
			spawned = append(spawned, *cached)
			continue
		}
		spawned = append(spawned, Spawned{RoleID: role.ID, AgentID: agent.ID, Agent: agent})
	}
	return spawned, nil
}

// topoSortRoles returns the roles ordered so that every role appears AFTER
// any role that delegates to it (i.e. parents first).
//
// The parent lists are indexed once up front, so each visit only walks the
// roles that actually delegate to the visited role instead of rescanning the
// whole definition. Parents are recorded in declaration order, which keeps
// the resulting order identical to a full rescan.
//
// If the delegates_to graph contains a cycle, the input slice is returned
// unchanged (declaration order). Per the original note, validate() rejects
// cycles at load (V68), so that branch is not expected to fire.
func topoSortRoles(roles []org.Role) []org.Role {
	idx := make(map[string]int, len(roles))
	// parents[child] lists the IDs of roles whose DelegatesTo names child.
	parents := make(map[string][]string, len(roles))
	for i, r := range roles {
		idx[r.ID] = i
		for _, child := range r.DelegatesTo {
			parents[child] = append(parents[child], r.ID)
		}
	}

	visited := make(map[string]bool, len(roles))
	inProgress := make(map[string]bool, len(roles))
	var out []org.Role

	// visit emits id after all of its parents; it reports false when it
	// re-enters a role that is still being expanded (a cycle).
	var visit func(id string) bool
	visit = func(id string) bool {
		if visited[id] {
			return true
		}
		if inProgress[id] {
			return false // cycle — fall back
		}
		inProgress[id] = true
		if i, ok := idx[id]; ok {
			// Walk parents first (anyone whose DelegatesTo includes us).
			for _, parentID := range parents[id] {
				if !visit(parentID) {
					return false
				}
			}
			out = append(out, roles[i])
		}
		visited[id] = true
		inProgress[id] = false
		return true
	}

	for _, r := range roles {
		if !visit(r.ID) {
			// Cycle detected; return declaration order.
			return roles
		}
	}
	return out
}

// SpawnRole is the idempotent get-or-spawn primitive used by the lazy spawn
// path. If the role is already spawned, returns the cached Agent. Otherwise
// looks up the role in the org definition and runs the full spawn pipeline.
//
// Used by master.delegate so a worker is only created when a prompt actually
// routes to it — failed runs no longer pay the cost of spawning every role
// in the org just to discover the classifier rejected the prompt.
//
// Concurrency: calls for the same roleID are serialised through r.inflight.
// A caller that finds another spawn in progress blocks until that spawn
// finishes or ctx is cancelled (returning ctx.Err()), then re-checks the
// cache. Errors are not cached, so a waiter whose predecessor failed becomes
// the next spawner.
//
// Returns an error when the runner has no org definition, when roleID is not
// declared in it, or when spawnOne fails.
func (r *OrgRunner) SpawnRole(ctx context.Context, runID, roleID string) (*runtime.Agent, error) {
	// V79: serialize concurrent calls for the same role. Loop because
	// after waiting on someone else's inflight channel, the cache may
	// or may not have a result (their spawn could have errored).
	for {
		r.mu.Lock()
		// Fast path: the role is already spawned.
		if existing, ok := r.agents[roleID]; ok && existing != nil {
			r.mu.Unlock()
			return existing.Agent, nil
		}
		if waitCh, inProgress := r.inflight[roleID]; inProgress {
			// Release the lock before blocking so the in-progress spawn can
			// record its result and clear the marker.
			r.mu.Unlock()
			select {
			case <-waitCh:
				// Re-check the cache; the in-progress spawn completed
				// (successfully or not).
				continue
			case <-ctx.Done():
				return nil, ctx.Err()
			}
		}
		// We're the first; claim the inflight slot.
		done := make(chan struct{})
		r.inflight[roleID] = done
		r.mu.Unlock()

		// From here on every return path must call clearInflight exactly
		// once, otherwise waiters on `done` block until their ctx expires.
		// NOTE(review): clearInflight is not deferred, so a panic inside
		// spawnOne would leave the marker in place.
		if r.Def == nil {
			r.clearInflight(roleID, done)
			return nil, errors.New("orgrunner: no org definition")
		}
		// Take a pointer into the Roles slice rather than copying inside the
		// loop; spawnOne receives the role by value below.
		var role *org.Role
		for i := range r.Def.Roles {
			if r.Def.Roles[i].ID == roleID {
				role = &r.Def.Roles[i]
				break
			}
		}
		if role == nil {
			r.clearInflight(roleID, done)
			return nil, fmt.Errorf("orgrunner: role %q not in org definition", roleID)
		}
		// spawnOne stores the result in r.agents on success, so waiters woken
		// by clearInflight find it on their next cache check.
		ag, err := r.spawnOne(ctx, runID, *role)
		r.clearInflight(roleID, done)
		return ag, err
	}
}

// clearInflight ends the in-progress spawn identified by done: it drops the
// role's inflight marker (only if the marker is still this attempt's channel)
// and then closes done to wake every waiter. It must be called exactly once
// per channel, since closing an already-closed channel panics.
func (r *OrgRunner) clearInflight(roleID string, done chan struct{}) {
	func() {
		r.mu.Lock()
		defer r.mu.Unlock()
		if r.inflight[roleID] == done {
			delete(r.inflight, roleID)
		}
	}()
	// Close outside the lock, after the marker is gone.
	close(done)
}

// SpawnedAgents returns a point-in-time copy of the live agents keyed by
// agent ID. The runlive poll loop calls this on every iteration instead of
// holding a list captured at SpawnAll time, so lazily spawned agents are
// picked up on the next pass. Records without a live Agent are omitted.
func (r *OrgRunner) SpawnedAgents() map[string]*runtime.Agent {
	r.mu.Lock()
	defer r.mu.Unlock()

	live := make(map[string]*runtime.Agent, len(r.agents))
	for _, rec := range r.agents {
		if rec == nil || rec.Agent == nil {
			continue
		}
		live[rec.AgentID] = rec.Agent
	}
	return live
}

// SpawnedList returns a copy of every spawned (roleID, agentID, Agent)
// record. The order is unspecified because it follows map iteration. Used by
// the cleanup pass.
func (r *OrgRunner) SpawnedList() []Spawned {
	r.mu.Lock()
	defer r.mu.Unlock()

	records := make([]Spawned, 0, len(r.agents))
	for _, rec := range r.agents {
		if rec == nil {
			continue
		}
		records = append(records, *rec)
	}
	return records
}

// Spawned is one (role,agent) pair the runner produced.
type Spawned struct {
	// RoleID is the org-definition role the agent was spawned for.
	RoleID string
	// AgentID is the per-run agent identifier (role ID plus a run suffix).
	AgentID string
	// Agent is the live runtime agent handle.
	Agent *runtime.Agent
}

// spawnOne runs the spawn pipeline for a single role and, on success,
// registers the resulting Agent in r.agents under role.ID.
//
// Steps, in order: emit agent.spawn.routing and pick the runtime for
// role.Provider; insert the agents row with status 'spawning'; render the
// role's system prompt (best-effort) and persist the spawn artifacts; call
// rt.Spawn under a 30-second timeout; emit the spawned event, mark the row
// 'running', and build the live Agent.
//
// It returns an error when no runtime matches the provider, when the row
// insert fails, or when rt.Spawn fails (in which case the row is marked
// 'failed'). Errors from event emission, artifact writes and the status
// UPDATEs are discarded. It dereferences r.Orch and (via zoneFor) r.Def
// without nil checks, so callers must ensure both are set.
func (r *OrgRunner) spawnOne(ctx context.Context, runID string, role org.Role) (*runtime.Agent, error) {
	zoneScope := r.zoneFor(role.ID)
	// Roles that declare no tools still get the introspect tool.
	tools := role.Tools
	if len(tools) == 0 {
		tools = []string{"introspect"}
	}
	agentID := role.ID + "-" + shortRunID(runID)

	// Step 1: pick runtime.
	_, _ = r.Orch.Bus.Emit(ctx, event.Event{
		Kind: "agent.spawn.routing", RunID: runID, AgentID: agentID,
		Payload: map[string]any{"role": role.ID, "provider": role.Provider},
	})
	rt, err := r.pickRuntime(role.Provider)
	if err != nil {
		_, _ = r.Orch.Bus.Emit(ctx, event.Event{
			Kind: "agent.spawn.failed", RunID: runID, AgentID: agentID,
			Payload: map[string]any{"role": role.ID, "provider": role.Provider, "stage": "pick_runtime", "error": err.Error()},
		})
		return nil, err
	}

	// Step 2: insert agent row. Populate parent_agent_id from whichever
	// already-spawned peer's role's DelegatesTo includes this agent's
	// role. With lazy spawn the parent should always be spawned first
	// (it's the one calling SpawnRole on the child), so the lookup
	// resolves. Eager SpawnAll spawns in topoSortRoles order (parents
	// first) for the same reason. If no candidate parent has been spawned
	// yet the link is stored as NULL (per the original note, Introspect
	// then falls back to "master").
	parentAgentID := r.findParentAgentID(role.ID)
	now := store.Now()
	err = r.Orch.St.Tx(ctx, func(q store.Querier) error {
		_, err := q.Exec(
			`INSERT INTO agents(id, role, run_id, provider, status, zone_scope_json, spawned_at, heartbeat_at, parent_agent_id)
			 VALUES(?, ?, ?, ?, 'spawning', ?, ?, ?, ?)`,
			agentID, role.ID, runID, role.Provider,
			zoneScopeJSON(zoneScope),
			store.FmtTime(now), store.FmtTime(now),
			nullableStr(parentAgentID),
		)
		return err
	})
	if err != nil {
		_, _ = r.Orch.Bus.Emit(ctx, event.Event{
			Kind: "agent.spawn.failed", RunID: runID, AgentID: agentID,
			Payload: map[string]any{"role": role.ID, "stage": "insert_row", "error": err.Error()},
		})
		return nil, err
	}

	// Step 3: call runtime.Spawn — bound it so a hung Spawn surfaces as an
	// error instead of holding the whole run forever. Builds the system
	// prompt from the role's definition md file (if present) and renders
	// the introspection placeholders.
	_, _ = r.Orch.Bus.Emit(ctx, event.Event{
		Kind: "agent.spawn.calling_runtime", RunID: runID, AgentID: agentID,
		Payload: map[string]any{"provider": role.Provider, "definition": role.Definition},
	})

	// The role definition path is resolved (and sandboxed) by
	// resolveRolePath below.
	//
	// Every spawn records a full "what got loaded from where" trail so the
	// dashboard can show the user: role template path, rendered system
	// prompt (saved as artifact), tool allowlist, zone scope, peer list.
	systemPrompt := ""
	var resolvedPath, templateSha string
	templateBytes := 0
	// OC3: pick the override path when set, falling back to base Definition.
	// resolveRolePath sandboxes whichever we pick (V88).
	effectiveDef := org.ResolveSystemPromptPath(role)
	if effectiveDef != "" {
		// Throwaway agent used only to render the prompt template; the
		// long-lived Agent is built after rt.Spawn succeeds.
		tmpAgent := runtime.NewAgent(rt, r.Orch.St, r.Orch.Bus, r.Queue,
			agentID, role.ID, runID, zoneScope, tools, role.Provider, r.CostPerKTokDefault,
		)
		// Populate StaticPeers from the org definition so {{peers}} expands
		// even when siblings haven't been spawned yet (master is FIRST in
		// the topo-sorted spawn order — without this its prompt says
		// "team: ****" and the LLM can't dispatch).
		if r.Def != nil {
			for _, other := range r.Def.Roles {
				if other.ID == role.ID {
					continue
				}
				tmpAgent.StaticPeers = append(tmpAgent.StaticPeers, runtime.Peer{Role: other.ID})
			}
		}
		resolvedPath = r.resolveRolePath(effectiveDef)
		// A render failure is not fatal: systemPrompt stays empty and the
		// role_template_missing event below reports it.
		if p, err := tmpAgent.RenderSystemPrompt(ctx, resolvedPath); err == nil {
			systemPrompt = p
		}
		// Read raw bytes for telemetry. templateSha is the hex of the first
		// 8 bytes of the SHA-256 digest (16 hex characters).
		if raw, err := os.ReadFile(resolvedPath); err == nil {
			templateBytes = len(raw)
			h := sha256.Sum256(raw)
			templateSha = hex.EncodeToString(h[:8])
		}
		if systemPrompt == "" {
			_, _ = r.Orch.Bus.Emit(ctx, event.Event{
				Kind: "agent.role_template_missing", RunID: runID, AgentID: agentID,
				Payload: map[string]any{
					"role": role.ID, "definition": role.Definition, "resolved_path": resolvedPath,
					"hint": "create the role md file or update the yaml definition path; agents spawn uninstructed without it",
				},
			})
		}
	}

	// Persist the rendered prompt as a spawn-context artifact so the UI can
	// show "exactly what the LLM saw on spawn" without needing to be the
	// runtime. Write failures are ignored (best-effort artifacts).
	if systemPrompt != "" {
		// V75: anchor to runner.ProjectRoot if set, otherwise CWD-relative.
		// Without anchoring, spawn artifacts land in whichever directory
		// the process happened to start from instead of the per-run
		// sandbox the test fixture / production deployment configured.
		spawnDir := filepath.Join(".td", "runs", runID, "agents", agentID)
		if r.ProjectRoot != "" && !filepath.IsAbs(spawnDir) {
			spawnDir = filepath.Join(r.ProjectRoot, spawnDir)
		}
		_ = os.MkdirAll(spawnDir, 0o755)
		_ = os.WriteFile(filepath.Join(spawnDir, "system-prompt.md"), []byte(systemPrompt), 0o644)
		_ = os.WriteFile(filepath.Join(spawnDir, "spawn-context.json"), spawnContextJSON(map[string]any{
			"agent_id":       agentID,
			"role":           role.ID,
			"provider":       role.Provider,
			"definition":     role.Definition,
			"resolved_path":  resolvedPath,
			"template_bytes": templateBytes,
			"template_sha":   templateSha,
			"tools":          tools,
			"zone_scope":     zoneScope,
			"rendered_bytes": len(systemPrompt),
		}), 0o644)
	}

	// Emit the loaded-context event with everything an operator needs to
	// reproduce/debug the agent's starting conditions.
	_, _ = r.Orch.Bus.Emit(ctx, event.Event{
		Kind: "agent.role_loaded", RunID: runID, AgentID: agentID,
		Payload: map[string]any{
			"role":           role.ID,
			"provider":       role.Provider,
			"definition":     role.Definition,
			"resolved_path":  resolvedPath,
			"template_bytes": templateBytes,
			"template_sha":   templateSha,
			"rendered_bytes": len(systemPrompt),
			"tools":          tools,
			"zone_scope":     zoneScope,
		},
	})

	// The 30s bound applies to rt.Spawn only; events and row updates below
	// keep using the caller's ctx.
	spawnCtx, cancelSpawn := context.WithTimeout(ctx, 30*time.Second)
	defer cancelSpawn()
	if _, err := rt.Spawn(spawnCtx, runtime.SpawnSpec{
		AgentID: agentID, Role: role.ID, Provider: role.Provider,
		RunID:   runID, Tools: tools, ZoneScope: zoneScope,
		Prompt:  systemPrompt,
	}); err != nil {
		_, _ = r.Orch.Bus.Emit(ctx, event.Event{
			Kind: "agent.spawn.failed", RunID: runID, AgentID: agentID,
			Payload: map[string]any{
				"role": role.ID, "provider": role.Provider,
				"stage": "runtime_spawn", "error": err.Error(),
				"hint":  hintForSpawnError(role.Provider, err),
			},
		})
		// Mark the agent terminated so the row reflects reality.
		// NOTE(review): this uses the caller's ctx; if that ctx is already
		// cancelled the update may not run — confirm against store.Tx.
		_ = r.Orch.St.Tx(ctx, func(q store.Querier) error {
			_, e := q.Exec(`UPDATE agents SET status='failed', terminated_at=? WHERE id=?`,
				store.FmtTime(store.Now()), agentID)
			return e
		})
		return nil, fmt.Errorf("runtime.Spawn(%s, provider=%s): %w", role.ID, role.Provider, err)
	}

	// Step 4: announce ready. Payload carries the FULL structural
	// identity (role, provider, sub_org, tools, is_connector) so that
	// downstream tooling — grader, self-report CLI, dashboard — can
	// answer "what agents exist and what are their traits?" without
	// depending on the agent's own LLM cooperating. This is the
	// observability foundation that lets voluntary `self_report` tool
	// calls layer on richer self-narration without being the sole
	// source of identity truth.
	var subOrgID string
	var isConnector bool
	if r.Def != nil {
		subOrgID = org.SubOrgByRole(r.Def)[role.ID]
		isConnector = org.IsConnectorRole(role)
	}
	// NOTE(review): "tools" here is role.Tools (the declared list), whereas
	// agent.role_loaded above reports the defaulted `tools` — confirm the
	// difference is intended.
	_, _ = r.Orch.Bus.Emit(ctx, event.Event{
		Kind: event.KindAgentSpawned, RunID: runID, AgentID: agentID,
		Payload: map[string]any{
			"role":         role.ID,
			"provider":     role.Provider,
			"sub_org":      subOrgID,
			"tools":        role.Tools,
			"is_connector": isConnector,
			"zone_scope":   zoneScope,
		},
	})
	_ = r.Orch.St.Tx(ctx, func(q store.Querier) error {
		_, err := q.Exec(`UPDATE agents SET status='running' WHERE id=?`, agentID)
		return err
	})

	agent := runtime.NewAgent(rt, r.Orch.St, r.Orch.Bus, r.Queue,
		agentID, role.ID, runID, zoneScope, tools, role.Provider, r.CostPerKTokDefault,
	)
	agent.ProjectRoot = r.ProjectRoot
	// Populate StaticPeers — same reason as in the tmpAgent above:
	// HandleOne-time prompt renders also need the team.
	if r.Def != nil {
		for _, other := range r.Def.Roles {
			if other.ID == role.ID {
				continue
			}
			agent.StaticPeers = append(agent.StaticPeers, runtime.Peer{Role: other.ID})
		}
	}
	// V84: wire cross-suborg routing policy. SubOrgByRole + the per-org
	// policy flag let send_message/delegate reject illegal hops
	// without needing the org def at tool-call time.
	if r.Def != nil {
		agent.SubOrgByRole = org.SubOrgByRole(r.Def)
		agent.SubOrg = agent.SubOrgByRole[role.ID]
		agent.CrossSubOrgConn = r.Def.Policies.CrossSubOrgConn
		agent.IsConnector = org.IsConnectorRole(role)
	}
	// Role→agent resolver: lets the delegate / send_message tools
	// target roles by name and lazy-spawn when needed. Without this,
	// `delegate.to: "fe-worker"` produces an envelope the queue can't
	// deliver (it's looking for an agent_id, not a role).
	// The lazy spawn runs under context.Background(), so it is not tied to
	// the ctx of this spawnOne call. A failed spawn resolves to "".
	runnerRef := r
	currentRunID := runID
	agent.ResolveRoleAgent = func(roleID string) string {
		if id, ok := runnerRef.AgentIDFor(roleID); ok {
			return id
		}
		if _, err := runnerRef.SpawnRole(context.Background(), currentRunID, roleID); err != nil {
			return ""
		}
		id, _ := runnerRef.AgentIDFor(roleID)
		return id
	}
	// OC4: agents whose role allowlist includes any org.* / role.* tool
	// need the configsvc façade. Always install if the runner has one —
	// the allowlist guard in executeTool will still reject use by roles
	// that don't list them.
	if r.ConfigSvc != nil {
		agent.SetConfigSvc(r.ConfigSvc)
	}
	// Publish the agent last, once it is fully wired, so SpawnRole's cache
	// check never hands out a half-configured Agent.
	r.mu.Lock()
	r.agents[role.ID] = &Spawned{RoleID: role.ID, AgentID: agentID, Agent: agent}
	r.mu.Unlock()
	return agent, nil
}

// resolveRolePath resolves a role definition path to a file location, or
// returns "" when the path is empty or not allowed.
//
// V88: resolution is confined to the project sandbox so a malicious org yaml
// can't point at /etc/passwd (or any other file outside it). The sandbox is
// ProjectRoot (the current working directory when ProjectRoot is unset) plus
// the directory that contains the org yaml. Absolute paths are rejected, and
// so are relative paths that still start with a ".." component after Clean.
//
// Lookup order:
//  1. <org yaml dir>/<def>, then the same relative path under up to three
//     ancestor directories (to find patterns like `.td/roles/...`). The first
//     candidate that exists is returned. The walk stops as soon as a candidate
//     leaves the sandbox. The allowed roots are fixed for the whole walk;
//     checking against the directory currently being walked would accept
//     every candidate and let the walk escape the project tree.
//  2. <ProjectRoot>/.td/<def> (or ./.td/<def>), returned without an existence
//     check so the caller can report the path it expected.
func (r *OrgRunner) resolveRolePath(def string) string {
	if def == "" {
		return ""
	}
	if filepath.IsAbs(def) {
		return "" // disallowed; rendered prompt will be empty and a role_template_missing event will fire
	}
	clean := filepath.Clean(def)
	// Reject real parent traversal only; a name that merely begins with two
	// dots ("..drafts/role.md") is not an escape.
	if clean == ".." || strings.HasPrefix(clean, ".."+string(filepath.Separator)) {
		return "" // escapes the project tree
	}

	// Without a configured ProjectRoot the working directory is the implicit
	// project root, matching the CWD-relative `.td/` fallback below.
	sandboxRoot := r.ProjectRoot
	if sandboxRoot == "" {
		sandboxRoot = "."
	}

	// Try resolving relative to the org yaml's directory, walking up a few
	// levels to find common patterns like `.td/roles/...`.
	if r.Def != nil && r.Def.SourcePath != "" {
		orgDir := filepathDir(r.Def.SourcePath)
		base := orgDir
		for i := 0; i < 4; i++ {
			candidate := filepathJoin(base, clean)
			if !pathWithin(candidate, sandboxRoot, orgDir) {
				break
			}
			if _, err := osStat(candidate); err == nil {
				return candidate
			}
			base = filepathDir(base)
		}
	}

	// ProjectRoot-anchored fallback to `.td/<def>`.
	candidate := filepathJoin(".td", clean)
	if r.ProjectRoot != "" {
		candidate = filepathJoin(r.ProjectRoot, ".td", clean)
	}
	if !pathWithin(candidate, sandboxRoot) {
		return ""
	}
	return candidate
}

// pathWithin reports whether candidate lies inside (or is equal to) at least
// one of the given roots. Empty roots are skipped; with no usable root the
// result is false.
//
// Both sides are made absolute and have symlinks resolved before comparison —
// without that, a symlink inside the project root could redirect to
// /etc/passwd and pass the check. When a path does not exist yet, symlinks
// are resolved on its longest existing ancestor and the remaining components
// are re-appended, so candidate and root are always compared in the same
// form.
//
// Containment is decided per path component: only a relative path that is
// ".." or starts with "../" counts as outside, so an entry whose name merely
// begins with two dots (e.g. "..cfg") is still inside.
func pathWithin(candidate string, roots ...string) bool {
	// resolve follows symlinks on the longest existing prefix of the absolute
	// path p. If nothing along the path can be resolved it returns p as is.
	resolve := func(p string) string {
		cur, rest := p, ""
		for {
			if real, err := filepath.EvalSymlinks(cur); err == nil {
				return filepath.Join(real, rest)
			}
			parent := filepath.Dir(cur)
			if parent == cur {
				return p // reached the filesystem root without success
			}
			rest = filepath.Join(filepath.Base(cur), rest)
			cur = parent
		}
	}

	absCand, err := filepath.Abs(candidate)
	if err != nil {
		return false
	}
	absCand = resolve(absCand)

	parentPrefix := ".." + string(filepath.Separator)
	for _, root := range roots {
		if root == "" {
			continue
		}
		absRoot, err := filepath.Abs(root)
		if err != nil {
			continue
		}
		rel, err := filepath.Rel(resolve(absRoot), absCand)
		if err != nil {
			continue
		}
		if rel == ".." || strings.HasPrefix(rel, parentPrefix) {
			continue // candidate is above or beside this root
		}
		return true
	}
	return false
}

// hintForSpawnError translates a runtime.Spawn error into a human-readable
// hint surfaced in the spawn.failed event so the operator can act on it
// without reading code or stack traces.
//
// Matching is done on substrings of err.Error(), so wrapped errors are
// recognised too. provider selects the CLI-specific install hint when the
// failure looks like a missing executable. A nil err, an unrecognised
// message, or an unrecognised provider yields a generic pointer to the error
// text.
func hintForSpawnError(provider string, err error) string {
	const generic = "see error message above"
	if err == nil {
		return generic
	}
	msg := err.Error()
	switch {
	case strings.Contains(msg, "context deadline exceeded"):
		return "Spawn took >30s. The CLI subprocess may be hung at an interactive prompt (auth, EULA). Open the tmux session manually to check: `tmux -L harness ls`."
	case strings.Contains(msg, "tmux not available"):
		return "tmux binary not on PATH. Install via `brew install tmux` (macOS) or your package manager."
	case strings.Contains(msg, "claude-code: tmux") || strings.Contains(msg, "executable file not found"):
		switch provider {
		case "claude-code":
			return "`claude` CLI not on PATH. Install Claude Code, or use --runtime scripted for demos."
		case "codex":
			return "`codex` CLI not on PATH. Install via npm install -g @openai/codex, or use --runtime scripted."
		case "gemini":
			return "`gemini` CLI not on PATH. Install via npm install -g @google/gemini-cli, or use --runtime scripted."
		}
	}
	return generic
}

// contains reports whether sub occurs within s. Unlike strings.Contains, an
// empty sub never matches.
func contains(s, sub string) bool {
	return sub != "" && strings.Contains(s, sub)
}

// indexOf returns the byte index of the first occurrence of sub in s, or -1
// when sub does not occur. An empty sub is found at index 0.
func indexOf(s, sub string) int {
	return strings.Index(s, sub)
}

// pickRuntime returns the Runtime registered for provider.
//
// There is deliberately no fallback to a "default" runtime (L3 /
// live-behavior-audit): a silent fallback once made `runtime_mode=claude-code`
// run claude for an org that asked for `provider: codex`, so the user tested
// the wrong model without knowing. An unregistered provider is therefore an
// error that lists the registered providers (sorted, "default" omitted); the
// remedy is to pick a runtime mode that includes the org's providers or to
// edit the org.
func (r *OrgRunner) pickRuntime(provider string) (runtime.Runtime, error) {
	if rt, registered := r.Runtimes[provider]; registered {
		return rt, nil
	}

	names := make([]string, 0, len(r.Runtimes))
	for name := range r.Runtimes {
		if name != "default" {
			names = append(names, name)
		}
	}
	sort.Strings(names)

	const format = "orgrunner: no runtime registered for provider %q (available: %v). " +
		"Org wants a provider the chosen runtime mode doesn't expose; " +
		"falling back to default would silently swap the model and falsify the experiment"
	return nil, fmt.Errorf(format, provider, names)
}

// zoneFor returns the zone scope globs for a role: it finds a team whose
// roster lists roleID, then returns the zone_scope of the first sub-org that
// includes that team. Teams are tried in declaration order; a team that no
// sub-org claims is skipped. Returns nil when there is no org definition or
// no sub-org claims the role.
func (r *OrgRunner) zoneFor(roleID string) []string {
	if r.Def == nil {
		return nil
	}
	for _, team := range r.Def.Teams {
		onRoster := false
		for _, member := range team.Roster {
			if member == roleID {
				onRoster = true
				break
			}
		}
		if !onRoster {
			continue
		}
		// Find the sub-org that includes this team.
		for _, so := range r.Def.SubOrgs {
			for _, teamID := range so.Teams {
				if teamID == team.ID {
					return so.ZoneScope
				}
			}
		}
	}
	return nil
}

// DelegateTarget chooses a role to delegate to from `fromRole`'s
// delegates_to list. Phase N heuristic: first declared delegate target.
// (Phase O could pick by prompt kind / load / latency.)
//
// It returns ("", false) when the runner has no org definition, when
// fromRole is not declared, or when it declares no delegates.
func (r *OrgRunner) DelegateTarget(fromRole string) (string, bool) {
	if r.Def == nil {
		return "", false
	}
	for _, role := range r.Def.Roles {
		if role.ID == fromRole && len(role.DelegatesTo) > 0 {
			return role.DelegatesTo[0], true
		}
	}
	return "", false
}

// DelegateTargets returns the FULL delegates_to list for fromRole. Used
// by introspection / fan-out flows where the master needs to dispatch
// the same prompt to every declared child instead of picking one.
// Cross-suborg connector refs (@-prefixed) are excluded — those are
// routing endpoints, not first-class workers.
//
// It returns nil when the runner has no org definition or fromRole is not
// declared, and an empty non-nil slice when the role exists but has no
// eligible delegates.
func (r *OrgRunner) DelegateTargets(fromRole string) []string {
	if r.Def == nil {
		return nil
	}
	for _, role := range r.Def.Roles {
		if role.ID != fromRole {
			continue
		}
		out := make([]string, 0, len(role.DelegatesTo))
		for _, t := range role.DelegatesTo {
			if strings.HasPrefix(t, "@") {
				continue
			}
			out = append(out, t)
		}
		return out
	}
	return nil
}

// findParentAgentID looks for an already-spawned parent of childRoleID. A
// parent is any role whose DelegatesTo list names childRoleID; roles are
// scanned in declaration order and the agent ID of the first one present in
// r.agents is returned. The result is "" when there is no org definition,
// when no candidate parent has been spawned, or for the "master" role, which
// is the root by definition.
func (r *OrgRunner) findParentAgentID(childRoleID string) string {
	if childRoleID == "master" || r.Def == nil {
		return ""
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	for i := range r.Def.Roles {
		parent := &r.Def.Roles[i]
		for _, target := range parent.DelegatesTo {
			if target != childRoleID {
				continue
			}
			if rec := r.agents[parent.ID]; rec != nil {
				return rec.AgentID
			}
		}
	}
	return ""
}

// nullableStr maps the empty string to an untyped nil so that it is stored
// as SQL NULL in nullable columns; any other string is passed through
// unchanged. Local to the orgrunner.
func nullableStr(s string) any {
	var value any
	if s != "" {
		value = s
	}
	return value
}

// AgentByRole returns the live *runtime.Agent for a role id, or (nil, false)
// if the role hasn't been spawned. Used by runlive to install per-role
// hooks (e.g. PostInboxHook for master in LLM-driven mode).
func (r *OrgRunner) AgentByRole(roleID string) (*runtime.Agent, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if rec := r.agents[roleID]; rec != nil {
		return rec.Agent, true
	}
	return nil, false
}

// AgentIDFor returns the spawned agentID for a roleID. Returns ("", false)
// if the role has not yet been spawned (with lazy spawn this is a normal
// state, not an error).
func (r *OrgRunner) AgentIDFor(roleID string) (string, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if rec := r.agents[roleID]; rec != nil {
		return rec.AgentID, true
	}
	return "", false
}

// shortRunID returns the last eight bytes of runID, or runID unchanged when
// it is eight bytes or shorter. It is used as the suffix of per-run agent
// IDs.
func shortRunID(runID string) string {
	const keep = 8
	if n := len(runID); n > keep {
		return runID[n-keep:]
	}
	return runID
}

// zoneScopeJSON encodes the zone scope globs as a compact JSON array for the
// agents.zone_scope_json column. A nil or empty slice encodes as "[]".
//
// Encoding goes through encoding/json so globs containing quotes,
// backslashes or control characters still yield valid JSON. HTML escaping is
// disabled so ordinary globs (including ones with <, > or &) are written
// verbatim.
func zoneScopeJSON(zones []string) string {
	if len(zones) == 0 {
		return "[]"
	}
	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(zones); err != nil {
		// Not expected for a []string written to an in-memory builder; fall
		// back to the empty scope rather than storing malformed JSON.
		return "[]"
	}
	// Encode appends a newline after the value.
	return strings.TrimSuffix(sb.String(), "\n")
}