package orchestrator_test

import (
	"testing"

	"github.com/flothus/tmux-xterm-research/server-go/internal/harness/orchestrator"
)

// TestPhaseJ_FalsifierAcceptsRealImprovement verifies that EvaluateDiff
// reports a pass when the primary benchmark improves and the secondary
// benchmarks barely move.
func TestPhaseJ_FalsifierAcceptsRealImprovement(t *testing.T) {
	cfg := orchestrator.DefaultFalsifier()
	cfg.PrimaryBenchmark = "decomposable-v1"

	// Secondary scores shift by only 0.01 in either direction.
	secondaryBefore := map[string]float64{"cross-cutting-v1": 0.60, "underspecified-v1": 0.55}
	secondaryAfter := map[string]float64{"cross-cutting-v1": 0.61, "underspecified-v1": 0.54}

	// Primary goes 0.50 -> 0.65 (a 30% relative gain), with 3 sample runs
	// and an evaluator agreement of 0.85.
	verdict, err := orchestrator.EvaluateDiff(cfg, 0.50, 0.65, secondaryBefore, secondaryAfter, 3, 0.85)
	if err != nil {
		t.Fatal(err)
	}
	if verdict.Pass {
		return
	}
	t.Errorf("expected pass; rationale: %s", verdict.Rationale)
}

// TestPhaseJ_FalsifierRejectsRegressionElsewhere verifies that a gain on the
// primary benchmark does not yield a pass when a secondary benchmark drops
// sharply.
func TestPhaseJ_FalsifierRejectsRegressionElsewhere(t *testing.T) {
	cfg := orchestrator.DefaultFalsifier()
	cfg.PrimaryBenchmark = "decomposable-v1"

	// cross-cutting-v1 falls from 0.60 to 0.40 while the primary rises
	// from 0.50 to 0.65.
	secondaryBefore := map[string]float64{"cross-cutting-v1": 0.60}
	secondaryAfter := map[string]float64{"cross-cutting-v1": 0.40}

	verdict, err := orchestrator.EvaluateDiff(cfg, 0.50, 0.65, secondaryBefore, secondaryAfter, 3, 0.85)
	if err != nil {
		t.Fatal(err)
	}
	if !verdict.Pass {
		return
	}
	t.Errorf("expected failure due to secondary regression; %s", verdict.Rationale)
}

// TestPhaseJ_FalsifierRejectsLowAgreement verifies that EvaluateDiff does not
// pass a diff whose evaluator agreement is low, even though the primary
// benchmark improves and no secondary scores are supplied.
func TestPhaseJ_FalsifierRejectsLowAgreement(t *testing.T) {
	const (
		sampleRuns = 3
		// NOTE(review): the original comment places this "way below 0.80";
		// 0.80 is presumably the default agreement threshold — confirm
		// against DefaultFalsifier.
		agreement = 0.50
	)

	cfg := orchestrator.DefaultFalsifier()
	cfg.PrimaryBenchmark = "decomposable-v1"

	verdict, err := orchestrator.EvaluateDiff(cfg, 0.50, 0.70, nil, nil, sampleRuns, agreement)
	if err != nil {
		t.Fatal(err)
	}
	if !verdict.Pass {
		return
	}
	t.Errorf("expected failure due to low evaluator agreement; %s", verdict.Rationale)
}

// TestPhaseJ_FalsifierRejectsSmallSample verifies that EvaluateDiff does not
// pass a diff backed by a single sample run, even though the primary
// benchmark improves and evaluator agreement is 0.85.
func TestPhaseJ_FalsifierRejectsSmallSample(t *testing.T) {
	const (
		// NOTE(review): the original comment marks this as "< 3"; 3 is
		// presumably the default minimum run count — confirm against
		// DefaultFalsifier.
		sampleRuns = 1
		agreement  = 0.85
	)

	cfg := orchestrator.DefaultFalsifier()
	cfg.PrimaryBenchmark = "decomposable-v1"

	verdict, err := orchestrator.EvaluateDiff(cfg, 0.50, 0.70, nil, nil, sampleRuns, agreement)
	if err != nil {
		t.Fatal(err)
	}
	if !verdict.Pass {
		return
	}
	t.Errorf("expected failure due to small sample; %s", verdict.Rationale)
}

// TestPhaseJ_FalsifierRequiresPrimary verifies that EvaluateDiff returns an
// error when the falsifier config has no primary benchmark set.
func TestPhaseJ_FalsifierRequiresPrimary(t *testing.T) {
	c := orchestrator.DefaultFalsifier()
	// Clear the field explicitly so the precondition named in the failure
	// message holds regardless of what DefaultFalsifier chooses as a default.
	c.PrimaryBenchmark = ""
	if _, err := orchestrator.EvaluateDiff(c, 0, 0, nil, nil, 3, 1.0); err == nil {
		t.Errorf("expected error when primary benchmark is empty")
	}
}
