// Package cli is the non-interactive CLI runtime. Each LLM call shells out
// to the profile's command (`<command> <args...> [prompt]`) and waits for
// the process to exit, capturing clean stdout. No TTY, no TUI rendering, no
// send-keys guessing.
//
// This is the production-grade real-CLI path:
//
//	claude -p "..."     non-interactive Claude Code one-shot mode
//	codex exec "..."    non-interactive Codex one-shot mode
//
// Each turn = one process. No session state shared across turns; the
// caller passes accumulated conversation history in the prompt or via
// --session-id (if the CLI supports it).
//
// Compared to the tmux/send-keys runtime this trades session continuity
// for reliability: every turn produces parseable stdout we can extract a
// harness-out block from. That's the right trade for harness use.
package cli

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/runtime"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/runtime/tmux"
)

// Profile names a non-interactive CLI mode.
type Profile struct {
	// Name identifies the provider (e.g. "claude-print", "codex-exec").
	Name string
	// Command is the executable to run.
	Command string
	// Args are static arguments placed before the prompt when the prompt is
	// passed as an argument.
	Args []string
	// PromptVia controls how the prompt is delivered: "arg" (appended as
	// the last positional argument) or "stdin". CallLLM treats the empty
	// string the same as "arg".
	PromptVia string
	// TokenEstDivisor is the characters-per-token divisor used to estimate
	// token usage when the CLI reports none. parseHarnessOut substitutes
	// 4.0 for values <= 0.
	TokenEstDivisor float64
}

// ClaudePrintProfile is the canonical configuration for `claude -p`.
// The prompt is passed as the last positional argument, after -p.
var ClaudePrintProfile = Profile{
	Name:            "claude-print",
	Command:         "claude",
	Args:            []string{"-p"},
	PromptVia:       "arg",
	TokenEstDivisor: 4.0,
}

// CodexExecProfile is the canonical configuration for `codex exec`.
// Subscription-billed (ChatGPT Plus/Pro) like the TUI — no API tokens.
var CodexExecProfile = Profile{
	Name:            "codex-exec",
	Command:         "codex",
	Args:            []string{"exec"},
	PromptVia:       "arg",
	TokenEstDivisor: 3.6,
}

// GeminiPrintProfile is the canonical non-interactive configuration for
// the Google `gemini` CLI. When the user has cached OAuth from a prior
// interactive login, this uses the same Code Assist free-tier quota as
// the TUI (Google's docs state quotas are "shared across interactive and
// agent modes"). If OAuth isn't cached the CLI falls back to
// GEMINI_API_KEY / GOOGLE_API_KEY (paid Gemini API). Pre-flight should
// verify OAuth is present if you want subscription billing.
var GeminiPrintProfile = Profile{
	Name:            "gemini-print",
	Command:         "gemini",
	Args:            []string{"-p"},
	PromptVia:       "arg",
	TokenEstDivisor: 4.0,
}

// Runtime implements runtime.Runtime by shelling out per turn.
type Runtime struct {
	// Profile selects the CLI command, its static arguments and how the
	// prompt is handed over.
	Profile Profile

	// Timeout caps how long one process invocation may run.
	Timeout time.Duration

	// LogDir, when set, is the directory where per-agent transcript logs
	// are written ("<LogDir>/<agentID>.log"). Same path the tmux pipe-pane
	// log uses, so the UI's `/api/harness/agents/{id}/pane` endpoint works
	// uniformly across runtimes — without this, headless / cli print-mode
	// agents have no live-pane view because the endpoint only reads tmux
	// log files. Empty disables logging (tests / older callers stay silent).
	LogDir string

	// mu guards agents and the per-agent state it points to.
	mu sync.Mutex
	// agents maps agent ID to its registered state.
	agents map[string]*agentState
}

// NewRuntime constructs a Runtime for the given profile with a 120-second
// per-invocation timeout and an empty agent registry.
func NewRuntime(p Profile) *Runtime {
	rt := new(Runtime)
	rt.Profile = p
	rt.Timeout = 120 * time.Second
	rt.agents = make(map[string]*agentState)
	return rt
}

type (
	// agentState is what the runtime remembers about one spawned agent.
	agentState struct {
		systemPrompt string
		history      []turn
		status       string
	}

	// turn is one recorded exchange entry.
	turn struct {
		role    string // "user" or "assistant"
		content string
	}
)

// PreFlight verifies the CLI is callable end-to-end without hanging on
// an interactive modal (update prompt, EULA, auth wizard). Runs
// `<command> --version` with a 5s ceiling.
Cheap to call at startup; a hang // here is the L2 signature the live audit caught — the codex/gemini // "no output ever" disasters happen because the CLI is waiting on a // modal we can't see from the harness. func (r *Runtime) PreFlight(ctx context.Context) error { if _, err := exec.LookPath(r.Profile.Command); err != nil { return fmt.Errorf("cli: %s not on PATH: %w", r.Profile.Command, err) } cctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() cmd := exec.CommandContext(cctx, r.Profile.Command, "--version") out, err := cmd.CombinedOutput() if cctx.Err() == context.DeadlineExceeded { return fmt.Errorf("cli: %s --version timed out after 5s (CLI likely waiting on an interactive modal — update prompt, auth flow, EULA)", r.Profile.Command) } if err != nil { return fmt.Errorf("cli: %s --version exit: %w: %s", r.Profile.Command, err, strings.TrimSpace(string(out))) } return nil } // Spawn registers the agent. No subprocess yet — that happens per CallLLM. func (r *Runtime) Spawn(ctx context.Context, spec runtime.SpawnSpec) (string, error) { if _, err := exec.LookPath(r.Profile.Command); err != nil { return "", fmt.Errorf("cli: %s not on PATH: %w", r.Profile.Command, err) } r.mu.Lock() r.agents[spec.AgentID] = &agentState{ systemPrompt: spec.Prompt, status: "running", } r.mu.Unlock() // Seed the per-agent transcript log so the UI's live-pane viewer has // something to show before the first turn fires. Same file the tmux // pane endpoint reads. r.appendLog(spec.AgentID, fmt.Sprintf("[harness] spawned via cli runtime (cmd=%s)\n[harness] system prompt:\n%s\n", r.Profile.Command, spec.Prompt)) return spec.AgentID, nil } // appendLog writes one entry to the per-agent transcript log. Best-effort: // if LogDir is empty or the write fails (perm denied, disk full), we // silently skip — the log is observability, not correctness. 
The log file // path matches the tmux pipe-pane format so the same /api/harness/agents/ // {id}/pane endpoint serves both runtimes uniformly. func (r *Runtime) appendLog(agentID, body string) { if r.LogDir == "" { return } _ = os.MkdirAll(r.LogDir, 0o755) path := filepath.Join(r.LogDir, agentID+".log") f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { return } defer f.Close() _, _ = f.WriteString(body) } // CallLLM runs one subprocess turn. Builds a full prompt from accumulated // history + the new incoming message + role md (if present), invokes the // CLI, captures stdout, parses the harness-out block. func (r *Runtime) CallLLM(ctx context.Context, agentID string, req runtime.LLMRequest) (*runtime.LLMResponse, error) { r.mu.Lock() state, ok := r.agents[agentID] if !ok { r.mu.Unlock() return nil, runtime.ErrUnknownAgent } // V70: buildPrompt reads state.history; another goroutine could // be appending to it concurrently. Hold the lock across the read so // the slice header isn't raced. prompt := buildPrompt(state, req) r.mu.Unlock() cctx, cancel := context.WithTimeout(ctx, r.Timeout) defer cancel() args := append([]string{}, r.Profile.Args...) if r.Profile.PromptVia == "arg" || r.Profile.PromptVia == "" { args = append(args, prompt) } cmd := exec.CommandContext(cctx, r.Profile.Command, args...) var stdout, stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr if r.Profile.PromptVia == "stdin" { cmd.Stdin = strings.NewReader(prompt) } // Log the outbound prompt before invoking the CLI so the UI's live // pane shows the prompt even if the call hangs/errors before producing // stdout. The format mirrors what xterm renders cleanly for tmux logs. 
r.appendLog(agentID, fmt.Sprintf("\n[harness] >> turn started %s\n%s\n[harness] >> awaiting CLI...\n", time.Now().UTC().Format(time.RFC3339), prompt)) if err := cmd.Run(); err != nil { errOut := stderr.String() r.appendLog(agentID, fmt.Sprintf("[harness] << CLI error: %v\n%s\n", err, errOut)) return &runtime.LLMResponse{ Text: stdout.String(), ParseFailures: 1, }, fmt.Errorf("cli: %s exit: %w: %s", r.Profile.Command, err, errOut) } out := stdout.String() // Mirror stdout to the per-agent transcript log so the live-pane UI // renders the CLI's response in real time — without this, headless and // cli-print agents look "silent" in the dashboard even though they // produced full responses. r.appendLog(agentID, fmt.Sprintf("[harness] << CLI response (%d bytes)\n%s\n", len(out), out)) // Append to history so future turns see context. r.mu.Lock() state.history = append(state.history, turn{role: "user", content: prompt}, turn{role: "assistant", content: out}, ) r.mu.Unlock() resp, err := parseHarnessOut(out, prompt, r.Profile.TokenEstDivisor) if err != nil { return &runtime.LLMResponse{Text: out, ParseFailures: 1}, err } return resp, nil } // Terminate marks the agent terminated. No subprocess to kill — each turn // already exited. func (r *Runtime) Terminate(ctx context.Context, agentID, reason string) error { r.mu.Lock() defer r.mu.Unlock() if s, ok := r.agents[agentID]; ok { s.status = "terminated" } return nil } // Health returns the agent's runtime status. func (r *Runtime) Health(ctx context.Context, agentID string) (string, error) { r.mu.Lock() defer r.mu.Unlock() if s, ok := r.agents[agentID]; ok { return s.status, nil } return "", runtime.ErrUnknownAgent } // buildPrompt assembles the per-turn prompt: system prompt (first turn // only — subsequent turns reference it implicitly via history), then a // short "incoming message" block + the harness-out output contract. 
func buildPrompt(state *agentState, req runtime.LLMRequest) string { var sb strings.Builder if len(state.history) == 0 && state.systemPrompt != "" { sb.WriteString(state.systemPrompt) sb.WriteString("\n\n") } if req.IncomingMessage != nil { m := req.IncomingMessage sb.WriteString("[harness-turn]\n") sb.WriteString(fmt.Sprintf("Inbox message id=%s type=%s from=%s task_id=%s\n", m.ID, m.Type, m.From, m.TaskID)) if m.Payload.Intent != "" { sb.WriteString("intent: " + m.Payload.Intent + "\n") } if m.Payload.Expects != "" { sb.WriteString("expects: " + string(m.Payload.Expects) + "\n") } for _, ref := range m.Payload.ContextRefs { sb.WriteString("context_ref: " + ref + "\n") } } else if req.Prompt != "" { sb.WriteString(req.Prompt + "\n") } if len(req.AvailableTools) > 0 { sb.WriteString("\navailable tools: " + strings.Join(req.AvailableTools, ", ") + "\n") } if req.PriorParseError != "" { // L8: corrective context for the next attempt. sb.WriteString("\n[correction] your previous response failed parsing: ") sb.WriteString(req.PriorParseError) sb.WriteString("\nemit EXACTLY one fenced harness-out JSON block, no surrounding prose.\n") } sb.WriteString("\nReply ONLY with one fenced harness-out block of JSON. Schema:\n") sb.WriteString("```harness-out\n{\"text\":\"...\",\"tool_calls\":[{\"name\":\"\",\"args\":{...}}],\"tokens\":{\"prompt\":0,\"completion\":0}}\n```\n") sb.WriteString("Always emit the block, even on a no-op turn. No prose outside it.\n") return sb.String() } // parseHarnessOut extracts the harness-out block from stdout, falling back // to a "reformat retry" path is the caller's responsibility — here we just // produce a clean response or an error. 
func parseHarnessOut(stdout, prompt string, divisor float64) (*runtime.LLMResponse, error) { body, ok := tmux.ExtractBlock(stdout) if !ok { return nil, errors.New("cli: no harness-out block in stdout") } var wire struct { Text string `json:"text"` ToolCalls []runtime.ToolCall `json:"tool_calls"` Tokens runtime.TokenUsage `json:"tokens"` } if err := jsonUnmarshalRelaxed([]byte(body), &wire); err != nil { return nil, fmt.Errorf("cli: malformed JSON in harness-out: %w", err) } if wire.Tokens.Prompt == 0 && wire.Tokens.Completion == 0 { // Estimate from char count. Guard against divisor=0 (would yield // +Inf and an undefined int conversion). d := divisor if d <= 0 { d = 4.0 } wire.Tokens.Prompt = int(float64(len(prompt)) / d) wire.Tokens.Completion = int(float64(len(body)) / d) } return &runtime.LLMResponse{ Text: wire.Text, ToolCalls: wire.ToolCalls, Tokens: wire.Tokens, }, nil } // jsonUnmarshalRelaxed is encoding/json with a tolerance pass for common // LLM mistakes (trailing commas etc.). For Phase O we use strict json; // extending later if real outputs need it. func jsonUnmarshalRelaxed(raw []byte, v any) error { return jsonStrict(raw, v) } // jsonStrict delegates to encoding/json. Separated so we can swap in a // tolerant parser later without touching call sites. func jsonStrict(raw []byte, v any) error { return jsonImpl(raw, v) }