// Package evaluator is the pluggable evaluation framework. See plan §10.
//
// An evaluator scores a target (task, run, or artifact) against criteria.
// Multiple evaluators with structural independence (clean context, different
// tools, different focus) produce richer signal than a single judge.
//
// Evaluators are first-class rows in the evaluators table; their decisions
// are evaluations rows that can be calibrated against user ratings.
package evaluator

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"sync"

	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/event"
	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/store"
)

// Kind identifies the evaluator implementation strategy. Its string value is
// what Register writes to the kind column of the evaluators table.
type Kind string

// Evaluator kinds. Of these, only KindHeuristic and KindLLMJudge are returned
// by the concrete evaluators defined in this file; RecordUserRating writes the
// literal 'user-rating' for its built-in 'user' evaluator row.
const (
	KindUserRating      Kind = "user-rating"
	KindLLMJudge        Kind = "llm-judge"
	KindHeuristic       Kind = "heuristic"
	KindFocusedLLMJudge Kind = "focused-llm-judge"
	KindABBlind         Kind = "a-b-blind"
)

// TargetKind discriminates what's being evaluated. Its string value is stored
// in evaluations.target_kind and sent as "target_kind" in the payload of the
// event emitted after an evaluation is persisted.
type TargetKind string

// Target kinds. EvaluateAll only performs its tasks.run_id backfill for
// TargetTask.
const (
	TargetTask     TargetKind = "task"
	TargetRun      TargetKind = "run"
	TargetArtifact TargetKind = "artifact"
)

// Score is a single numeric or qualitative dimension of an evaluation.
//
// scoresMap serializes a slice of scores as an object keyed by Dimension, so
// two scores sharing a Dimension collapse to whichever comes later.
type Score struct {
	Dimension string  // e.g. "correctness", "simplicity", "test_coverage"
	Value     float64 // intended range 0..1; the Registry does not enforce it
	Rationale string  // serialized next to Value under the "rationale" key
}

// Target describes the thing being evaluated. The evaluator implementation
// decides how to read the target (file, db row, etc).
type Target struct {
	// Kind says what ID refers to; it is persisted as evaluations.target_kind.
	Kind TargetKind
	// ID identifies the target; it is persisted as evaluations.target_id and
	// is also embedded in generated evaluation IDs.
	ID   string
	// RunID lets the evaluator emit RunID on the evaluation.recorded
	// event so dashboards can filter/group without round-tripping through
	// tasks→runs. May be empty; for task targets EvaluateAll backfills it
	// from tasks.run_id when missing.
	RunID string
	// Path optionally points at an artifact on disk (for heuristic
	// evaluators that run linters). HeuristicEvaluator does not read it; it
	// returns an error when Path is set but Body is empty.
	Path string
	// Body is the textual content for evaluators that don't read from disk.
	Body string
}

// Evaluator is the pluggable interface implemented by every scoring strategy.
type Evaluator interface {
	// ID returns the evaluator's identifier. The Registry uses it as the key
	// of its in-memory map, as evaluators.id, and as evaluations.evaluator_id,
	// so registering a second evaluator with the same ID replaces the first.
	ID() string
	// Kind returns the implementation strategy; Register stores it in
	// evaluators.kind.
	Kind() Kind
	// Evaluate scores target and returns the per-dimension scores plus a
	// free-form rationale. When it returns an error, EvaluateAll reports the
	// failure on the event bus and moves on to the next evaluator.
	Evaluate(ctx context.Context, target Target) ([]Score, string /* rationale */, error)
}

// Registry holds registered evaluators and persists evaluations.
//
// The evaluator map is private and is only read or written while holding mu.
// Because the struct contains a mutex it must be shared by pointer (as
// NewRegistry returns it) and never copied.
type Registry struct {
	St  *store.Store // harness store backing the evaluators/evaluations tables
	Bus *event.Bus   // bus that EvaluateAll emits evaluation events on

	mu         sync.RWMutex         // guards evaluators
	evaluators map[string]Evaluator // keyed by Evaluator.ID()
}

// NewRegistry builds an empty Registry wired to the given harness store and
// event bus. Evaluators are added afterwards through Register.
func NewRegistry(st *store.Store, bus *event.Bus) *Registry {
	reg := &Registry{
		St:         st,
		Bus:        bus,
		evaluators: make(map[string]Evaluator),
	}
	return reg
}

// Register adds an evaluator. The evaluator's row is upserted into the
// evaluators table (kind refreshed, enabled set to 1) so dashboards can list
// it.
//
// The evaluator is placed in the in-memory map before the upsert runs, so it
// stays registered for EvaluateAll even when the returned error is non-nil.
// Registering a second evaluator with the same ID replaces the first.
//
// It returns an error when e is nil or when the upsert fails; the store error
// is wrapped and remains reachable through errors.Is / errors.As.
func (r *Registry) Register(ctx context.Context, e Evaluator) error {
	if e == nil {
		// Fail with an error instead of panicking on e.ID() below.
		return errors.New("evaluator: cannot register nil evaluator")
	}
	// Read ID and Kind once so the map key and the persisted row always
	// agree, even if an implementation computes them dynamically.
	id, kind := e.ID(), string(e.Kind())

	r.mu.Lock()
	r.evaluators[id] = e
	r.mu.Unlock()

	err := r.St.Tx(ctx, func(q store.Querier) error {
		_, err := q.Exec(
			`INSERT INTO evaluators(id, kind, enabled) VALUES(?, ?, 1)
			 ON CONFLICT(id) DO UPDATE SET kind=excluded.kind, enabled=1`,
			id, kind,
		)
		return err
	})
	if err != nil {
		return fmt.Errorf("register evaluator %q: %w", id, err)
	}
	return nil
}

// EvaluateAll runs every evaluator currently registered in this Registry on
// target and persists one evaluations row per successful evaluator. It returns
// the IDs of the rows inserted.
//
// The enabled column of the evaluators table is not consulted here: every
// evaluator in the in-memory map runs. Evaluators execute sequentially in map
// iteration order, which is unspecified.
//
// An evaluator whose Evaluate call fails is reported on the bus as a
// KindPolicyViolation event and skipped; the remaining evaluators still run.
// A persistence failure aborts the loop and returns the IDs inserted so far
// together with the error.
//
// Every persisted evaluation emits a KindEvaluationRecorded event. The event's
// TaskID is set only for task targets, so run and artifact IDs are never
// reported as task IDs; the target's ID is always available in the payload as
// "target_id". Event emission is best-effort: Emit errors are ignored.
func (r *Registry) EvaluateAll(ctx context.Context, target Target) ([]string, error) {
	// Snapshot the registered evaluators so no lock is held while the
	// (potentially slow) Evaluate calls run.
	type entry struct {
		id string
		ev Evaluator
	}
	r.mu.RLock()
	snapshot := make([]entry, 0, len(r.evaluators))
	for id, ev := range r.evaluators {
		snapshot = append(snapshot, entry{id: id, ev: ev})
	}
	r.mu.RUnlock()

	var out []string
	for _, en := range snapshot {
		scores, rationale, err := en.ev.Evaluate(ctx, target)
		if err != nil {
			// Report but don't abort other evaluators.
			_, _ = r.Bus.Emit(ctx, event.Event{
				Kind:    event.KindPolicyViolation,
				Payload: map[string]any{"evaluator": en.id, "error": err.Error()},
			})
			continue
		}
		evalID, err := r.persist(ctx, en.ev, target, scores, rationale)
		if err != nil {
			return out, err
		}
		out = append(out, evalID)

		// V65: include RunID on evaluation.recorded so dashboards can
		// filter/group evaluations by run. Without this, subscribers had
		// to round-trip through tasks→runs to attribute scores.
		runID, taskID := target.RunID, ""
		switch target.Kind {
		case TargetTask:
			taskID = target.ID
			if runID == "" && target.ID != "" {
				// Best-effort backfill: on a missing task or a query
				// error runID simply stays empty.
				_ = r.St.DB().QueryRowContext(ctx,
					`SELECT IFNULL(run_id,'') FROM tasks WHERE id=?`, target.ID,
				).Scan(&runID)
			}
		case TargetRun:
			// For a run target the target ID is itself the run ID.
			if runID == "" {
				runID = target.ID
			}
		}
		_, _ = r.Bus.Emit(ctx, event.Event{
			Kind:   event.KindEvaluationRecorded,
			RunID:  runID,
			TaskID: taskID,
			Payload: map[string]any{
				"evaluator": en.id, "target_kind": string(target.Kind),
				"target_id": target.ID, "scores": scoresMap(scores),
			},
		})
	}
	return out, nil
}

// persist writes one evaluations row for the given evaluator/target pair and
// returns the generated row ID ("ev_<evaluator>_<target>_<unix-nanos>").
//
// The scores are stored as JSON in scores_json. rationale is accepted for
// interface symmetry but is not stored yet: rationale_path is written as NULL.
//
// It returns an error, without touching the database, when the scores cannot
// be encoded as JSON (encoding/json rejects NaN and ±Inf values); previously
// such scores were stored as an empty, unparseable scores_json. Store errors
// are wrapped and remain reachable through errors.Is / errors.As.
func (r *Registry) persist(ctx context.Context, e Evaluator, target Target, scores []Score, rationale string) (string, error) {
	evaluatorID := e.ID()
	scj, err := json.Marshal(scoresMap(scores))
	if err != nil {
		return "", fmt.Errorf("encode scores from evaluator %q: %w", evaluatorID, err)
	}

	// Read the clock once so the timestamp embedded in the ID and the
	// created_at column describe the same instant.
	now := store.Now()
	evalID := fmt.Sprintf("ev_%s_%s_%d", evaluatorID, target.ID, now.UnixNano())
	err = r.St.Tx(ctx, func(q store.Querier) error {
		_, err := q.Exec(
			`INSERT INTO evaluations(id, target_kind, target_id, evaluator_id, scores_json, rationale_path, created_at) VALUES(?, ?, ?, ?, ?, NULL, ?)`,
			evalID, string(target.Kind), target.ID, evaluatorID, string(scj),
			store.FmtTime(now),
		)
		return err
	})
	if err != nil {
		return evalID, fmt.Errorf("persist evaluation %q: %w", evalID, err)
	}
	return evalID, nil
}

// scoresMap converts a slice of scores into the object shape that is stored in
// scores_json and sent in event payloads: one entry per dimension, each an
// object with "value" and "rationale" keys. When several scores share a
// dimension, the last one wins. The result is never nil.
func scoresMap(scs []Score) map[string]any {
	byDimension := make(map[string]any)
	for i := range scs {
		sc := scs[i]
		byDimension[sc.Dimension] = map[string]any{
			"value":     sc.Value,
			"rationale": sc.Rationale,
		}
	}
	return byDimension
}

// RecordUserRating attaches a ground-truth user rating to a target and
// computes calibration (agreed_with_user) for every existing non-user
// evaluation of the same target.
//
// "Agreement" is defined here as |evaluator-score - user-score| ≤ 0.2 on
// the primary "overall" dimension (or, if absent, the mean of all
// dimensions). A tiny epsilon is added to the window so that scores exactly
// 0.2 apart in decimal (e.g. 0.8 vs 0.6) are not rejected because of binary
// floating-point rounding. This is the calibration loop described in plan
// §10.4.
//
// The user rating is inserted first, in its own transaction, under the
// built-in 'user' evaluator (created on demand). The agreement flags are then
// written in a second transaction, so a failure there leaves the user rating
// in place. Evaluations whose scores_json cannot be parsed are skipped and
// keep their current agreed_with_user value.
//
// It returns an error when userScore cannot be encoded as JSON (NaN or ±Inf)
// or when any store operation fails.
func (r *Registry) RecordUserRating(ctx context.Context, target Target, userScore float64) error {
	const (
		agreementTolerance = 0.2
		// agreementEpsilon absorbs rounding such as
		// 0.8-0.6 == 0.20000000000000007.
		agreementEpsilon = 1e-9
	)

	// Insert a user-rating evaluation row.
	body := map[string]any{"overall": map[string]any{"value": userScore, "rationale": "user"}}
	bj, err := json.Marshal(body)
	if err != nil {
		return fmt.Errorf("encode user rating: %w", err)
	}
	// Read the clock once so the ID and created_at describe the same instant.
	now := store.Now()
	userEvalID := fmt.Sprintf("ev_user_%s_%d", target.ID, now.UnixNano())
	err = r.St.Tx(ctx, func(q store.Querier) error {
		_, err := q.Exec(
			`INSERT INTO evaluators(id, kind, enabled) VALUES('user', 'user-rating', 1)
			 ON CONFLICT(id) DO UPDATE SET enabled=1`,
		)
		if err != nil {
			return err
		}
		_, err = q.Exec(
			`INSERT INTO evaluations(id, target_kind, target_id, evaluator_id, scores_json, agreed_with_user, created_at)
			 VALUES(?, ?, ?, 'user', ?, 1, ?)`,
			userEvalID, string(target.Kind), target.ID, string(bj), store.FmtTime(now),
		)
		return err
	})
	if err != nil {
		return fmt.Errorf("record user rating for %s %q: %w", target.Kind, target.ID, err)
	}

	// Compute agreement for existing non-user evaluations of this target.
	rows, err := r.St.DB().QueryContext(ctx,
		`SELECT id, scores_json FROM evaluations WHERE target_kind=? AND target_id=? AND evaluator_id<>'user'`,
		string(target.Kind), target.ID,
	)
	if err != nil {
		return fmt.Errorf("load evaluations for %s %q: %w", target.Kind, target.ID, err)
	}
	defer rows.Close()
	type pair struct {
		id    string
		agree int
	}
	var updates []pair
	for rows.Next() {
		var id, scj string
		if err := rows.Scan(&id, &scj); err != nil {
			return fmt.Errorf("scan evaluation row: %w", err)
		}
		var m map[string]any
		if err := json.Unmarshal([]byte(scj), &m); err != nil {
			// An unparseable row has no score to compare; leave its
			// calibration untouched rather than judging it as 0.
			continue
		}
		agree := 0
		if abs(overallScore(m)-userScore) <= agreementTolerance+agreementEpsilon {
			agree = 1
		}
		updates = append(updates, pair{id, agree})
	}
	// Next returning false can mean an iteration error, not just the end of
	// the result set.
	if err := rows.Err(); err != nil {
		return fmt.Errorf("iterate evaluations for %s %q: %w", target.Kind, target.ID, err)
	}
	// Release the read cursor before opening the write transaction.
	if err := rows.Close(); err != nil {
		return fmt.Errorf("close evaluations cursor: %w", err)
	}
	if len(updates) == 0 {
		return nil
	}
	err = r.St.Tx(ctx, func(q store.Querier) error {
		for _, u := range updates {
			if _, err := q.Exec(`UPDATE evaluations SET agreed_with_user=? WHERE id=?`, u.agree, u.id); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("update calibration for %s %q: %w", target.Kind, target.ID, err)
	}
	return nil
}

// overallScore reduces a decoded scores_json object to a single number.
//
// It returns the "overall" dimension's value when that entry is an object
// with a float64 "value". Otherwise it returns the mean of every entry that
// has that shape, or 0 when there is none (including a nil or empty map).
func overallScore(m map[string]any) float64 {
	// numeric extracts entry["value"] when entry is a score object holding
	// a float64 value.
	numeric := func(entry any) (float64, bool) {
		obj, isObj := entry.(map[string]any)
		if !isObj {
			return 0, false
		}
		val, isNum := obj["value"].(float64)
		return val, isNum
	}

	// A missing "overall" key yields a nil entry, which numeric rejects.
	if val, found := numeric(m["overall"]); found {
		return val
	}

	// Fall back to the mean of all well-formed dimensions.
	var total float64
	count := 0
	for _, entry := range m {
		if val, found := numeric(entry); found {
			total += val
			count++
		}
	}
	if count == 0 {
		return 0
	}
	return total / float64(count)
}

// abs returns the magnitude of x: negative inputs are negated, everything
// else is returned unchanged.
func abs(x float64) float64 {
	switch {
	case x < 0:
		return -x
	default:
		return x
	}
}

// --- Concrete evaluators ---

// HeuristicEvaluator runs deterministic checks on a target body. For Phase E
// it reports three dimensions:
//   - "length" — byte length of the body divided by 1000, clamped to 0..1
//   - "structure" — 1.0 when the body has a markdown heading, 0.3 otherwise
//   - "overall" — the mean of the two dimensions above
//
// Later phases plug in linters/test runners. The interface stays identical.
type HeuristicEvaluator struct {
	Name string
}

// ID returns the configured Name.
func (he HeuristicEvaluator) ID() string {
	return he.Name
}

// Kind reports KindHeuristic.
func (he HeuristicEvaluator) Kind() Kind {
	return KindHeuristic
}

// Evaluate scores target.Body and returns the length, structure and overall
// scores (in that order) with the rationale "ok". It fails when the target
// only carries a Path, because this evaluator does not read from disk. A
// target with neither Body nor Path is scored as an empty body.
func (he HeuristicEvaluator) Evaluate(ctx context.Context, target Target) ([]Score, string, error) {
	text := target.Body
	if text == "" && target.Path != "" {
		// Phase E: we don't read disk here; tests pass Body directly.
		return nil, "", errors.New("heuristic: no body provided")
	}

	size := len(text)
	length := Score{
		Dimension: "length",
		Value:     clamp01(float64(size) / 1000),
		Rationale: fmt.Sprintf("len=%d", size),
	}
	structure := Score{
		Dimension: "structure",
		Value:     structureScore(text),
		Rationale: "markdown headings",
	}
	overall := Score{
		Dimension: "overall",
		Value:     (length.Value + structure.Value) / 2,
		Rationale: "mean of length+structure",
	}
	return []Score{length, structure, overall}, "ok", nil
}

// structureScore rates how structured body looks: 1.0 when any line starts
// with '#' (the body begins with '#' or contains "\n#"), 0.3 otherwise.
func structureScore(body string) float64 {
	const (
		withHeading    = 1.0
		withoutHeading = 0.3
	)
	hasLaterHeading := strings.Contains(body, "\n#")
	startsWithHeading := strings.HasPrefix(body, "#")
	if hasLaterHeading || startsWithHeading {
		return withHeading
	}
	return withoutHeading
}

// clamp01 limits v to the closed interval [0, 1]: values below 0 become 0,
// values above 1 become 1, and anything else is returned as is.
func clamp01(v float64) float64 {
	switch {
	case v < 0:
		return 0
	case v > 1:
		return 1
	default:
		return v
	}
}

// LLMJudge is a scripted/llm-backed evaluator. For Phase E we accept a
// callback so tests can plug in a deterministic judge; later phases pass a
// real LLM call.
type LLMJudge struct {
	// Name is returned verbatim by ID.
	Name string

	// Callback produces the scores and rationale; Evaluate delegates to it.
	Callback func(ctx context.Context, target Target) ([]Score, string, error)
}

// ID returns the configured Name.
func (j LLMJudge) ID() string {
	return j.Name
}

// Kind reports KindLLMJudge.
func (j LLMJudge) Kind() Kind {
	return KindLLMJudge
}

// Evaluate forwards ctx and target to Callback and returns its result
// unchanged. It fails when no Callback has been configured.
func (j LLMJudge) Evaluate(ctx context.Context, target Target) ([]Score, string, error) {
	if judge := j.Callback; judge != nil {
		return judge(ctx, target)
	}
	return nil, "", errors.New("llm-judge: no callback")
}