Testing

Unit Testing Evaluators

Before deploying evaluators to production, test them locally against known inputs to make sure they score correctly.

Testing CONTAINS evaluator

import { describe, it, expect } from "vitest";
import { containsEvaluator } from "@2signal/eval-engine";

describe("mentions-pricing evaluator", () => {
  // Pass if ANY of these substrings appears, ignoring case.
  const config = {
    value: ["pricing", "cost", "$"],
    mode: "any" as const,
    case_sensitive: false,
  };

  // Helper so each test reads as: input -> score.
  const scoreOf = (output: string) =>
    containsEvaluator.evaluate({ output }, config).score;

  it("passes when output mentions pricing", () => {
    expect(scoreOf("Our pricing starts at $10/month")).toBe(1);
  });

  it("fails when output has no pricing info", () => {
    expect(scoreOf("Thanks for reaching out! How can I help?")).toBe(0);
  });
});

Testing REGEX_MATCH evaluator

import { describe, it, expect } from "vitest";
import { regexMatchEvaluator } from "@2signal/eval-engine";

describe("email validation evaluator", () => {
  // Email-shaped pattern; "\\." keeps the dot escaped inside the string literal.
  const config = {
    pattern: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
  };

  // Helper so each test reads as: input -> score.
  const scoreOf = (output: string) =>
    regexMatchEvaluator.evaluate({ output }, config).score;

  it("passes when output contains email", () => {
    expect(scoreOf("Contact us at support@example.com for help.")).toBe(1);
  });

  it("fails when no email present", () => {
    expect(scoreOf("Please visit our website for more info.")).toBe(0);
  });
});

Testing JSON_SCHEMA evaluator

import { describe, it, expect } from "vitest";
import { jsonSchemaEvaluator } from "@2signal/eval-engine";

describe("response format evaluator", () => {
  // Output must parse as a JSON object with a non-empty "answer" string
  // and a "confidence" number in [0, 1].
  const config = {
    schema: {
      type: "object",
      required: ["answer", "confidence"],
      properties: {
        answer: { type: "string", minLength: 1 },
        confidence: { type: "number", minimum: 0, maximum: 1 },
      },
    },
  };

  // Helper so each test reads as: input -> score.
  const scoreOf = (output: string) =>
    jsonSchemaEvaluator.evaluate({ output }, config).score;

  it("passes for valid JSON", () => {
    expect(scoreOf('{"answer": "Yes", "confidence": 0.95}')).toBe(1);
  });

  it("fails for missing required field", () => {
    expect(scoreOf('{"answer": "Yes"}')).toBe(0);
  });

  it("fails for non-JSON output", () => {
    expect(scoreOf("This is just plain text")).toBe(0);
  });
});

Testing LATENCY evaluator

import { describe, it, expect } from "vitest";
import { latencyEvaluator } from "@2signal/eval-engine";

describe("latency evaluator", () => {
  // As exercised below: latencies under target_ms score 1, latencies over
  // max_ms score 0, and values in between receive partial credit.
  const config = { max_ms: 5000, target_ms: 2000 };

  it("scores 1.0 for fast responses", () => {
    // 1500ms is below the 2000ms target.
    const result = latencyEvaluator.evaluate({ latency_ms: 1500 }, config);
    expect(result.score).toBe(1);
  });

  it("scores between 0-1 for moderate responses", () => {
    // 3500ms sits between target_ms and max_ms.
    const result = latencyEvaluator.evaluate({ latency_ms: 3500 }, config);
    expect(result.score).toBeGreaterThan(0);
    expect(result.score).toBeLessThan(1);
  });

  it("scores 0 for responses exceeding max", () => {
    // 6000ms exceeds the 5000ms maximum.
    const result = latencyEvaluator.evaluate({ latency_ms: 6000 }, config);
    expect(result.score).toBe(0);
  });
});

Testing SIMILARITY evaluator

import { describe, it, expect } from "vitest";
import { similarityEvaluator } from "@2signal/eval-engine";

describe("similarity evaluator", () => {
  const config = { threshold: 0.7 };

  // Helper: score how close `output` is to `expected`.
  const scoreOf = (output: string, expected: string) =>
    similarityEvaluator.evaluate({ output, expected }, config).score;

  it("scores high for similar outputs", () => {
    const score = scoreOf(
      "The weather today is sunny and warm",
      "Today's weather is sunny and warm outside",
    );
    expect(score).toBeGreaterThan(0.7);
  });

  it("scores low for dissimilar outputs", () => {
    const score = scoreOf(
      "The cat sat on the mat",
      "Quantum physics is a branch of science",
    );
    expect(score).toBeLessThan(0.3);
  });
});

Running tests

# Run all evaluator tests
pnpm --filter eval-engine test

# Run a specific test file
pnpm --filter eval-engine test evaluators/contains.test.ts

# Watch mode during development
pnpm --filter eval-engine test --watch

Tips

  • Test edge cases: empty strings, very long inputs, special characters, unicode.
  • Test the negate flag for the CONTAINS and REGEX_MATCH evaluators.
  • For LLM Judge, mock the OpenAI client in tests to avoid API costs and flakiness.
  • Keep test datasets small and focused — each test should validate one behavior.

Have questions? Join our community!

Connect with other developers and the 2Signal team.

Join Discord