Testing
Unit Testing Evaluators
Before deploying evaluators to production, test them locally against known inputs to make sure they score correctly.
Testing CONTAINS evaluator
import { describe, it, expect } from "vitest";
import { containsEvaluator } from "@2signal/eval-engine";
describe("mentions-pricing evaluator", () => {
const config = {
value: ["pricing", "cost", "$"],
mode: "any" as const,
case_sensitive: false,
};
it("passes when output mentions pricing", () => {
const result = containsEvaluator.evaluate(
{ output: "Our pricing starts at $10/month" },
config,
);
expect(result.score).toBe(1);
});
it("fails when output has no pricing info", () => {
const result = containsEvaluator.evaluate(
{ output: "Thanks for reaching out! How can I help?" },
config,
);
expect(result.score).toBe(0);
});
});

Testing REGEX_MATCH evaluator
import { describe, it, expect } from "vitest";
import { regexMatchEvaluator } from "@2signal/eval-engine";
describe("email validation evaluator", () => {
const config = {
pattern: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
};
it("passes when output contains email", () => {
const result = regexMatchEvaluator.evaluate(
{ output: "Contact us at support@example.com for help." },
config,
);
expect(result.score).toBe(1);
});
it("fails when no email present", () => {
const result = regexMatchEvaluator.evaluate(
{ output: "Please visit our website for more info." },
config,
);
expect(result.score).toBe(0);
});
});

Testing JSON_SCHEMA evaluator
import { describe, it, expect } from "vitest";
import { jsonSchemaEvaluator } from "@2signal/eval-engine";
describe("response format evaluator", () => {
const config = {
schema: {
type: "object",
required: ["answer", "confidence"],
properties: {
answer: { type: "string", minLength: 1 },
confidence: { type: "number", minimum: 0, maximum: 1 },
},
},
};
it("passes for valid JSON", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: '{"answer": "Yes", "confidence": 0.95}' },
config,
);
expect(result.score).toBe(1);
});
it("fails for missing required field", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: '{"answer": "Yes"}' },
config,
);
expect(result.score).toBe(0);
});
it("fails for non-JSON output", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: "This is just plain text" },
config,
);
expect(result.score).toBe(0);
});
});

Testing LATENCY and COST evaluators
import { describe, it, expect } from "vitest";
import { latencyEvaluator, costEvaluator } from "@2signal/eval-engine";
describe("latency evaluator", () => {
const config = { max_ms: 5000, target_ms: 2000 };
it("scores 1.0 for fast responses", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 1500 }, config);
expect(result.score).toBe(1);
});
it("scores between 0-1 for moderate responses", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 3500 }, config);
expect(result.score).toBeGreaterThan(0);
expect(result.score).toBeLessThan(1);
});
it("scores 0 for responses exceeding max", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 6000 }, config);
expect(result.score).toBe(0);
});
});

Testing SIMILARITY evaluator
import { describe, it, expect } from "vitest";
import { similarityEvaluator } from "@2signal/eval-engine";
describe("similarity evaluator", () => {
const config = { threshold: 0.7 };
it("scores high for similar outputs", () => {
const result = similarityEvaluator.evaluate(
{
output: "The weather today is sunny and warm",
expected: "Today's weather is sunny and warm outside",
},
config,
);
expect(result.score).toBeGreaterThan(0.7);
});
it("scores low for dissimilar outputs", () => {
const result = similarityEvaluator.evaluate(
{
output: "The cat sat on the mat",
expected: "Quantum physics is a branch of science",
},
config,
);
expect(result.score).toBeLessThan(0.3);
});
});

Running tests
# Run all evaluator tests
pnpm --filter eval-engine test
# Run a specific test file
pnpm --filter eval-engine test evaluators/contains.test.ts
# Watch mode during development
pnpm --filter eval-engine test --watch

Tips
- Test edge cases: empty strings, very long inputs, special characters, unicode.
- Test the `negate` flag for Contains and Regex evaluators.
- For LLM Judge, mock the OpenAI client in tests to avoid API costs and flakiness.
- Keep test datasets small and focused — each test should validate one behavior.