Testing
Unit Testing Evaluators
Before deploying evaluators to production, test them locally against known inputs to make sure they score correctly.
Testing CONTAINS evaluator
import { describe, it, expect } from "vitest";
import { containsEvaluator } from "@2signal/eval-engine";
describe("mentions-pricing evaluator", () => {
const config = {
value: ["pricing", "cost", "$"],
mode: "any" as const,
case_sensitive: false,
};
it("passes when output mentions pricing", () => {
const result = containsEvaluator.evaluate(
{ output: "Our pricing starts at $10/month" },
config,
);
expect(result.score).toBe(1);
});
it("fails when output has no pricing info", () => {
const result = containsEvaluator.evaluate(
{ output: "Thanks for reaching out! How can I help?" },
config,
);
expect(result.score).toBe(0);
});
});

Testing REGEX_MATCH evaluator
import { describe, it, expect } from "vitest";
import { regexMatchEvaluator } from "@2signal/eval-engine";
describe("email validation evaluator", () => {
const config = {
pattern: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
};
it("passes when output contains email", () => {
const result = regexMatchEvaluator.evaluate(
{ output: "Contact us at support@example.com for help." },
config,
);
expect(result.score).toBe(1);
});
it("fails when no email present", () => {
const result = regexMatchEvaluator.evaluate(
{ output: "Please visit our website for more info." },
config,
);
expect(result.score).toBe(0);
});
});

Testing JSON_SCHEMA evaluator
import { describe, it, expect } from "vitest";
import { jsonSchemaEvaluator } from "@2signal/eval-engine";
describe("response format evaluator", () => {
const config = {
schema: {
type: "object",
required: ["answer", "confidence"],
properties: {
answer: { type: "string", minLength: 1 },
confidence: { type: "number", minimum: 0, maximum: 1 },
},
},
};
it("passes for valid JSON", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: '{"answer": "Yes", "confidence": 0.95}' },
config,
);
expect(result.score).toBe(1);
});
it("fails for missing required field", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: '{"answer": "Yes"}' },
config,
);
expect(result.score).toBe(0);
});
it("fails for non-JSON output", () => {
const result = jsonSchemaEvaluator.evaluate(
{ output: "This is just plain text" },
config,
);
expect(result.score).toBe(0);
});
});

Testing LATENCY and COST evaluators
import { describe, it, expect } from "vitest";
import { latencyEvaluator, costEvaluator } from "@2signal/eval-engine";
describe("latency evaluator", () => {
const config = { max_ms: 5000, target_ms: 2000 };
it("scores 1.0 for fast responses", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 1500 }, config);
expect(result.score).toBe(1);
});
it("scores between 0-1 for moderate responses", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 3500 }, config);
expect(result.score).toBeGreaterThan(0);
expect(result.score).toBeLessThan(1);
});
it("scores 0 for responses exceeding max", () => {
const result = latencyEvaluator.evaluate({ latency_ms: 6000 }, config);
expect(result.score).toBe(0);
});
});

Testing SIMILARITY evaluator
import { describe, it, expect } from "vitest";
import { similarityEvaluator } from "@2signal/eval-engine";
describe("similarity evaluator", () => {
const config = { threshold: 0.7 };
it("scores high for similar outputs", () => {
const result = similarityEvaluator.evaluate(
{
output: "The weather today is sunny and warm",
expected: "Today's weather is sunny and warm outside",
},
config,
);
expect(result.score).toBeGreaterThan(0.7);
});
it("scores low for dissimilar outputs", () => {
const result = similarityEvaluator.evaluate(
{
output: "The cat sat on the mat",
expected: "Quantum physics is a branch of science",
},
config,
);
expect(result.score).toBeLessThan(0.3);
});
});

Running tests
# Run all evaluator tests
pnpm --filter eval-engine test
# Run a specific test file
pnpm --filter eval-engine test evaluators/contains.test.ts
# Watch mode during development
pnpm --filter eval-engine test --watch

Tips
- Test edge cases: empty strings, very long inputs, special characters, unicode.
- Test the `negate` flag for Contains and Regex evaluators.
- For LLM Judge, mock the OpenAI client in tests to avoid API costs and flakiness.
- Keep test datasets small and focused — each test should validate one behavior.