// LLMJudge is a Classifier that asks a small Anthropic model to route the
// prompt. Unlike Rules it understands paraphrase, doesn't choke on prompts
// without imperative verbs, and can pick a steering target by *content*
// instead of literal task-id substring match.
//
// Talks to the Anthropic Messages API directly (same wire format as
// runtime/headless). Kept independent of the headless runtime because that
// one is per-agent stateful and we want one-shot, no-history calls here.

package prompt

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"time"
	"unicode/utf8"
)

// Defaults for talking to the Anthropic Messages API.
const (
	// llmEndpoint is the URL NewLLMJudge installs as LLMJudge.Endpoint.
	llmEndpoint   = "https://api.anthropic.com/v1/messages"
	// llmAPIVersion is sent as the "anthropic-version" header on every
	// request, regardless of which endpoint the judge is configured with.
	llmAPIVersion = "2023-06-01"
	// llmModel is the model name NewLLMJudge installs as LLMJudge.Model.
	llmModel      = "claude-haiku-4-5-20251001"
)

// LLMJudge is an LLM-backed Classifier. If the API call fails or returns
// malformed output, Classify falls back to Fallback (typically a *Rules) so
// the master still gets a routable answer instead of a hard error.
type LLMJudge struct {
	// APIKey is sent as the "x-api-key" request header.
	APIKey    string
	// Model is the value of the "model" field in the request body.
	Model     string
	// Endpoint is the URL the classification request is POSTed to.
	Endpoint  string
	// HTTP is the client used to perform the request. NewLLMJudge installs
	// one with a 10-second timeout.
	HTTP      *http.Client
	// Fallback is consulted by Classify whenever the LLM call or the parsing
	// of its reply returns an error. Required.
	Fallback  Classifier
	// MaxTokens is the value of the "max_tokens" field in the request body.
	MaxTokens int
}

// NewLLMJudge returns a judge that authenticates against the Anthropic
// Messages API with apiKey and uses the package defaults for model and
// endpoint, a 10-second HTTP client timeout, and a 256-token reply budget.
//
// fallback must be non-nil (pass NewRules() if nothing better is available):
// Classify delegates to it whenever the LLM call fails.
func NewLLMJudge(apiKey string, fallback Classifier) *LLMJudge {
	client := &http.Client{Timeout: 10 * time.Second}

	judge := new(LLMJudge)
	judge.APIKey = apiKey
	judge.Model = llmModel
	judge.Endpoint = llmEndpoint
	judge.HTTP = client
	judge.Fallback = fallback
	judge.MaxTokens = 256
	return judge
}

// NewClassifierFromEnv picks the active classifier from the environment:
//
//	HARNESS_CLASSIFIER=rules    → deterministic keyword matcher (legacy)
//	HARNESS_CLASSIFIER=llm      → LLM; if ANTHROPIC_API_KEY is missing a
//	                              warning is printed to stderr and Rules is
//	                              returned instead
//	(unset or anything else)    → LLM if ANTHROPIC_API_KEY is present, else Rules
//
// HARNESS_CLASSIFIER is matched case-insensitively. Both variables are
// stripped of surrounding whitespace first: a key copied with a trailing
// newline would otherwise be rejected as an invalid header value on every
// request.
//
// The LLM path always wraps Rules as a fallback so a network blip cannot
// break dispatch. The function never fails; it always returns a usable
// Classifier.
func NewClassifierFromEnv() Classifier {
	rules := NewRules()

	mode := strings.ToLower(strings.TrimSpace(os.Getenv("HARNESS_CLASSIFIER")))
	if mode == "rules" {
		return rules
	}

	if key := strings.TrimSpace(os.Getenv("ANTHROPIC_API_KEY")); key != "" {
		return NewLLMJudge(key, rules)
	}

	if mode == "llm" {
		// Loud fallback: user asked for LLM but didn't set the key.
		fmt.Fprintln(os.Stderr,
			"[prompt] HARNESS_CLASSIFIER=llm but ANTHROPIC_API_KEY unset — falling back to rules")
	}
	return rules
}

// Classify sends the user text + open-task list to the LLM and parses the
// JSON answer.
//
// Blank input (empty or whitespace-only in.Text) short-circuits to
// KindUnknown with zero confidence and no API call.
//
// On any failure of the LLM path (HTTP error, malformed output, invalid
// kind) it calls the Fallback classifier and prepends the failure reason to
// the fallback's rationale so the dashboard can see what happened.
//
// An error is returned only when the LLM path failed AND either no Fallback
// is configured or the Fallback itself failed; the returned error wraps the
// LLM-path error.
func (j *LLMJudge) Classify(ctx context.Context, in Input) (Result, error) {
	if strings.TrimSpace(in.Text) == "" {
		return Result{Kind: KindUnknown, Confidence: 0, Rationale: "empty"}, nil
	}
	res, err := j.callOnce(ctx, in)
	if err == nil {
		return res, nil
	}
	// Fallback is documented as required, but a zero-value or hand-built
	// judge may lack one. Report that as an error rather than panicking on a
	// nil interface call.
	if j.Fallback == nil {
		return Result{}, fmt.Errorf("llm_judge: %w; no fallback configured", err)
	}
	// Fallback path: never hard-fail dispatch on a classifier error.
	fb, fbErr := j.Fallback.Classify(ctx, in)
	if fbErr != nil {
		return Result{}, fmt.Errorf("llm_judge: %w; fallback: %v", err, fbErr)
	}
	fb.Rationale = fmt.Sprintf("[llm-judge fallback: %v] %s", err, fb.Rationale)
	return fb, nil
}

// callOnce performs a single, history-free request to the configured
// endpoint and converts the reply into a Result.
//
// The system prompt is built from in.OpenTasks and in.Text is sent as the
// only user message. An error is returned when the request cannot be built
// or sent, when the status is not 2xx (the error carries the status and the
// start of the body), when the body cannot be read or decoded, when the
// reply contains no text, or when parseJudgeJSON rejects the text.
func (j *LLMJudge) callOnce(ctx context.Context, in Input) (Result, error) {
	// Upper bound on how much of the response is buffered. A judge reply is
	// a short JSON object, so anything near this size is not a usable answer;
	// an over-long body is cut here and then fails envelope decoding.
	const maxResponseBytes = 1 << 20

	systemPrompt := buildJudgeSystem(in.OpenTasks)
	userMsg := in.Text

	body := map[string]any{
		"model":      j.Model,
		"max_tokens": j.MaxTokens,
		"system":     systemPrompt,
		"messages": []map[string]any{
			{"role": "user", "content": []map[string]any{
				{"type": "text", "text": userMsg},
			}},
		},
	}
	raw, err := json.Marshal(body)
	if err != nil {
		return Result{}, fmt.Errorf("marshal: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, j.Endpoint, bytes.NewReader(raw))
	if err != nil {
		return Result{}, err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-api-key", j.APIKey)
	req.Header.Set("anthropic-version", llmAPIVersion)

	resp, err := j.HTTP.Do(req)
	if err != nil {
		return Result{}, fmt.Errorf("http: %w", err)
	}
	defer resp.Body.Close()

	respBody, readErr := io.ReadAll(io.LimitReader(resp.Body, maxResponseBytes))
	// Check the status first: for a non-2xx reply the status code is the
	// useful signal even if the body could only be partially read.
	if resp.StatusCode/100 != 2 {
		return Result{}, fmt.Errorf("api %d: %s", resp.StatusCode, trim(string(respBody), 200))
	}
	if readErr != nil {
		return Result{}, fmt.Errorf("read body: %w", readErr)
	}

	var wire struct {
		Content []struct {
			Type string `json:"type"`
			Text string `json:"text"`
		} `json:"content"`
	}
	if err := json.Unmarshal(respBody, &wire); err != nil {
		return Result{}, fmt.Errorf("decode envelope: %w", err)
	}
	// Only "text" blocks carry the answer; other block types are skipped.
	var text strings.Builder
	for _, b := range wire.Content {
		if b.Type == "text" {
			text.WriteString(b.Text)
		}
	}
	if text.Len() == 0 {
		return Result{}, errors.New("empty content")
	}
	return parseJudgeJSON(text.String(), in.OpenTasks)
}

// parseJudgeJSON turns the model's free-form reply into a Result.
//
// The JSON object is located with extractJSONObject, so fences and prose
// around it are tolerated. The reply is rejected with an error when no
// object is found, when it does not decode, or when its kind (after
// trimming whitespace) is not accepted by validKind.
//
// A steering verdict is only honoured when target_task_id names one of the
// open tasks; otherwise the result is demoted to KindUnknown with confidence
// 0.3. For every other kind the target_task_id is passed through unchecked.
// Confidence is clamped to [0, 1].
func parseJudgeJSON(text string, open []TaskSummary) (Result, error) {
	obj := extractJSONObject(text)
	if obj == "" {
		return Result{}, fmt.Errorf("no json object in reply: %s", trim(text, 120))
	}

	var reply struct {
		Kind         string  `json:"kind"`
		Confidence   float64 `json:"confidence"`
		TargetTaskID string  `json:"target_task_id"`
		Rationale    string  `json:"rationale"`
	}
	if decodeErr := json.Unmarshal([]byte(obj), &reply); decodeErr != nil {
		return Result{}, fmt.Errorf("decode judge json: %w (body: %s)", decodeErr, trim(obj, 120))
	}

	kind := Kind(strings.TrimSpace(reply.Kind))
	if !validKind(kind) {
		return Result{}, fmt.Errorf("unknown kind %q from judge", reply.Kind)
	}

	// A steer the master cannot route is worse than no answer: demote it.
	unroutableSteer := kind == KindSteering &&
		(reply.TargetTaskID == "" || !openContains(open, reply.TargetTaskID))
	if unroutableSteer {
		demoted := Result{
			Kind:       KindUnknown,
			Confidence: 0.3,
			Rationale:  "judge said steering but target_task_id missing/invalid: " + reply.Rationale,
		}
		return demoted, nil
	}

	confidence := reply.Confidence
	switch {
	case confidence < 0:
		confidence = 0
	case confidence > 1:
		confidence = 1
	}

	out := Result{
		Kind:         kind,
		Confidence:   confidence,
		TargetTaskID: reply.TargetTaskID,
		Rationale:    reply.Rationale,
	}
	return out, nil
}

// extractJSONObject pulls a JSON object out of s. Handles the common cases:
// bare JSON, JSON inside ```json fences, JSON after a preamble, and a
// preamble that itself contains braces.
//
// Every '{' in s is tried, left to right, as the start of a balanced {...}
// run. The first run that is syntactically valid JSON is returned. If no run
// is valid JSON, the first balanced run found is returned so the caller's
// decode error can show what the model actually produced. If no run balances
// at all (or s has no '{'), the result is "".
func extractJSONObject(s string) string {
	firstBalanced := ""
	for from := 0; from < len(s); {
		rel := strings.IndexByte(s[from:], '{')
		if rel < 0 {
			break
		}
		start := from + rel
		if cand := scanBalancedObject(s, start); cand != "" {
			if json.Valid([]byte(cand)) {
				return cand
			}
			if firstBalanced == "" {
				firstBalanced = cand
			}
		}
		// Not usable from this brace; retry from the next one.
		from = start + 1
	}
	return firstBalanced
}

// scanBalancedObject returns the balanced {...} run of s that begins at
// index start (which must point at a '{'), or "" if the braces never
// balance. Quote-aware so braces inside strings don't break the balance.
func scanBalancedObject(s string, start int) string {
	depth := 0
	inStr := false
	esc := false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				// Previous byte was a backslash: this byte is literal.
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return s[start : i+1]
			}
		}
	}
	return ""
}

// validKind reports whether k is one of the intent kinds the judge is
// allowed to return (KindUnknown included).
func validKind(k Kind) bool {
	accepted := [...]Kind{
		KindImplementNew,
		KindImplementSub,
		KindImplementCrossBound,
		KindDebugDiscovery,
		KindDebugOwned,
		KindInfoUnblock,
		KindConversational,
		KindSteering,
		KindEvaluation,
		KindUnknown,
	}
	for _, candidate := range accepted {
		if k == candidate {
			return true
		}
	}
	return false
}

// openContains reports whether any task in tasks has exactly the given id.
// An empty or nil task list never matches.
func openContains(tasks []TaskSummary, id string) bool {
	for i := range tasks {
		if tasks[i].ID != id {
			continue
		}
		return true
	}
	return false
}

// buildJudgeSystem assembles the system prompt for the judge: the fixed
// routing instructions and output schema, followed by either a bullet list
// of the open tasks (id · state · title, with the title cut to 80 bytes by
// trim) or a note that steering is unavailable when there are none.
func buildJudgeSystem(open []TaskSummary) string {
	const instructions = `You are the prompt router for a multi-agent coding harness. Classify the user's prompt into exactly one intent and reply with a single JSON object — no prose, no markdown fence.

Intents:
- implement.new-feature     : new functionality the system doesn't have yet ("add X", "support Y", "build Z", "make it possible to ...")
- implement.sub-feature     : refinement or extension of an existing feature
- implement.cross-boundary  : refactor/rename/migrate spanning multiple components
- debug.discovery           : symptom report needing investigation ("broken", "fails", "why is X", "Z is wrong")
- debug.owned               : continued debugging of an already-known issue
- info.unblock              : the user is stuck and needs an answer before they can proceed
- conversational            : question or chat that does NOT ask the system to do work
- steering                  : redirect / override / continuation of an in-flight task — set target_task_id to the task being redirected
- evaluation                : asking to rate / score / review a prior result
- unknown                   : truly cannot tell (use sparingly — prefer the closest fit)

Prefer the most specific applicable intent over 'unknown'. If the user describes any concrete change, complaint, or request — even without an imperative verb — pick the closest implement.* or debug.* intent and explain in rationale.

Output schema (strict — no extra keys, no markdown):
{"kind": "<intent>", "confidence": <0.0-1.0>, "target_task_id": "<id or empty string>", "rationale": "<one short sentence>"}
`

	var prompt strings.Builder
	prompt.WriteString(instructions)

	// With nothing in flight, tell the model steering is off the table.
	if len(open) == 0 {
		prompt.WriteString("\n(no in-flight tasks — steering is not a valid choice right now)\n")
		return prompt.String()
	}

	prompt.WriteString("\nOpen in-flight tasks you may steer (id · state · title):\n")
	for _, task := range open {
		prompt.WriteString("- " + task.ID + " · " + task.State + " · " + trim(task.Title, 80) + "\n")
	}
	return prompt.String()
}

// trim shortens s to at most n bytes and appends "…" when anything was cut.
// Strings of n bytes or fewer are returned unchanged.
//
// The cut never lands inside a multi-byte UTF-8 sequence: it backs up (by at
// most three bytes) to the start of the rune that straddles position n, so
// valid UTF-8 input always yields valid UTF-8 output. A negative n is
// treated as 0.
func trim(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}
	cut := n
	// s[cut] is the first byte that would be dropped. If it is a continuation
	// byte, the rune before it would be split, so move the cut left. The step
	// count is bounded so malformed input cannot drag the cut arbitrarily far.
	for cut > 0 && n-cut < utf8.UTFMax-1 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + "…"
}