Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/llm-as-judge-skills/tests/evaluation.test.ts
/**
 * Test suite for the evaluation tools exported from `../src/index.js`:
 * direct scoring, pairwise comparison, rubric generation, and the
 * `EvaluatorAgent` wrapper.
 *
 * NOTE(review): the 60-120 second per-test timeouts suggest these tests call a
 * remote model rather than a stub — confirm before running them in CI.
 */
import { describe, it, expect, beforeAll } from 'vitest';
import {
  executeDirectScore,
  executePairwiseCompare,
  executeGenerateRubric,
  EvaluatorAgent
} from '../src/index.js';
import { validateConfig } from '../src/config/index.js';

// ---------------------------------------------------------------------------
// Test fixtures
// ---------------------------------------------------------------------------

/** Prompt that every fixture response below is an answer to. */
const TEST_PROMPT = 'Explain quantum entanglement to a high school student';

/** Detailed, analogy-driven answer; the tests treat it as the strongest one. */
const GOOD_RESPONSE = `Quantum entanglement is like having two magical coins that are connected in a special way.
When you flip one coin and it lands on heads, the other coin will instantly show tails,
no matter how far apart they are - even if one is on Earth and one is on Mars.

Here's what makes it special:
1. The connection is instantaneous - faster than light
2. You can't predict which side either coin will land on
3. But once you see one, you know exactly what the other shows

Scientists like Einstein called this "spooky action at a distance" because it seems impossible,
but experiments have proven it's real. This phenomenon is now being used to develop
super-secure communication systems and quantum computers.`;

/** Terse, low-effort answer; the tests expect it to lose against GOOD_RESPONSE. */
const POOR_RESPONSE = `Quantum entanglement is when particles are connected.
It's complicated physics stuff. Scientists study it.`;

/** Middle-of-the-road answer, also used as both sides of the tie test. */
const MEDIUM_RESPONSE = `Quantum entanglement happens when two particles become linked together.
When you measure one particle, you instantly know something about the other particle,
even if they're far apart. It's used in quantum computing research.`;

// Per-test timeouts (milliseconds), passed as the third argument to `it`.
const SHORT_TIMEOUT_MS = 60000;
const LONG_TIMEOUT_MS = 120000;

// Validate config once before all tests
beforeAll(() => {
  validateConfig();
});

describe('Direct Score Tool', () => {
  /** Scores `response` against a single, fully weighted "Quality" criterion. */
  const scoreOverallQuality = (response: string) =>
    executeDirectScore({
      response,
      prompt: TEST_PROMPT,
      criteria: [
        { name: 'Quality', description: 'Overall quality', weight: 1 }
      ]
    });

  it('should score a response against criteria', async () => {
    const scored = await executeDirectScore({
      response: GOOD_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: [
        {
          name: 'Accuracy',
          description: 'Scientific correctness of the explanation',
          weight: 0.4
        },
        {
          name: 'Clarity',
          description: 'Understandable for a high school student',
          weight: 0.3
        },
        {
          name: 'Engagement',
          description: 'Interesting and memorable',
          weight: 0.3
        }
      ],
      rubric: {
        scale: '1-5'
      }
    });

    expect(scored.success).toBe(true);
    // One score entry per criterion supplied above.
    expect(scored.scores).toHaveLength(3);
    expect(scored.overallScore).toBeGreaterThan(0);
    expect(scored.overallScore).toBeLessThanOrEqual(5);
    expect(scored.metadata.criteriaCount).toBe(3);

    // Good response should score reasonably well
    expect(scored.overallScore).toBeGreaterThanOrEqual(3);
  }, SHORT_TIMEOUT_MS);

  it('should provide lower scores for poor responses', async () => {
    // Deliberately sequential: the two scoring calls run one after the other.
    const goodResult = await scoreOverallQuality(GOOD_RESPONSE);
    const poorResult = await scoreOverallQuality(POOR_RESPONSE);

    expect(goodResult.success).toBe(true);
    expect(poorResult.success).toBe(true);
    expect(goodResult.overallScore).toBeGreaterThan(poorResult.overallScore);
  }, LONG_TIMEOUT_MS);
});

describe('Pairwise Compare Tool', () => {
  it('should correctly identify the better response', async () => {
    const verdict = await executePairwiseCompare({
      responseA: GOOD_RESPONSE,
      responseB: POOR_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['accuracy', 'clarity', 'completeness', 'engagement'],
      allowTie: true,
      swapPositions: true
    });

    expect(verdict.success).toBe(true);
    expect(verdict.winner).toBe('A');
    expect(verdict.confidence).toBeGreaterThan(0.5);
  }, LONG_TIMEOUT_MS);

  it('should handle similar responses appropriately', async () => {
    const verdict = await executePairwiseCompare({
      responseA: MEDIUM_RESPONSE,
      responseB: MEDIUM_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['quality'],
      allowTie: true,
      swapPositions: true
    });

    expect(verdict.success).toBe(true);
    // Same response should tie
    expect(verdict.winner).toBe('TIE');
  }, LONG_TIMEOUT_MS);

  it('should provide comparison details for each criterion', async () => {
    const verdict = await executePairwiseCompare({
      responseA: GOOD_RESPONSE,
      responseB: MEDIUM_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['accuracy', 'completeness'],
      allowTie: true,
      swapPositions: false
    });

    expect(verdict.success).toBe(true);
    // Two criteria were requested, so two per-criterion entries are expected.
    expect(verdict.comparison).toHaveLength(2);
    for (const entry of verdict.comparison) {
      expect(entry.criterion).toBeDefined();
      expect(['A', 'B', 'TIE']).toContain(entry.winner);
      expect(entry.reasoning).toBeDefined();
    }
  }, SHORT_TIMEOUT_MS);
});

describe('Generate Rubric Tool', () => {
  /** Requests a 1-5 "Code Quality" rubric, without examples, at the given strictness. */
  const codeQualityRubric = (strictness: 'lenient' | 'strict') =>
    executeGenerateRubric({
      criterionName: 'Code Quality',
      criterionDescription: 'Quality of code implementation',
      scale: '1-5',
      includeExamples: false,
      strictness
    });

  it('should generate a complete rubric', async () => {
    const rubric = await executeGenerateRubric({
      criterionName: 'Factual Accuracy',
      criterionDescription: 'How factually correct is the content',
      scale: '1-5',
      domain: 'educational content',
      includeExamples: true,
      strictness: 'balanced'
    });

    expect(rubric.success).toBe(true);
    expect(rubric.levels).toHaveLength(5);
    expect(rubric.scale.min).toBe(1);
    expect(rubric.scale.max).toBe(5);
    expect(rubric.scoringGuidelines.length).toBeGreaterThan(0);
    expect(rubric.edgeCases.length).toBeGreaterThan(0);

    // Check level structure
    for (const level of rubric.levels) {
      expect(level.score).toBeGreaterThanOrEqual(1);
      expect(level.score).toBeLessThanOrEqual(5);
      expect(level.label).toBeDefined();
      expect(level.description).toBeDefined();
      expect(level.characteristics.length).toBeGreaterThan(0);
    }
  }, SHORT_TIMEOUT_MS);

  it('should respect strictness setting', async () => {
    // Sequential on purpose, lenient first, then strict.
    const lenient = await codeQualityRubric('lenient');
    const strict = await codeQualityRubric('strict');

    expect(lenient.success).toBe(true);
    expect(strict.success).toBe(true);
    // Only the strictness reported in the result metadata is checked here.
    expect(lenient.metadata.strictness).toBe('lenient');
    expect(strict.metadata.strictness).toBe('strict');
  }, LONG_TIMEOUT_MS);
});

describe('Evaluator Agent', () => {
  // Assigned in beforeAll so construction happens during the run phase,
  // not while the suite is being collected.
  let agent: EvaluatorAgent;

  beforeAll(() => {
    agent = new EvaluatorAgent();
  });

  it('should provide integrated evaluation workflow', async () => {
    const criteria = [
      { name: 'Accuracy', description: 'Scientific correctness' },
      { name: 'Accessibility', description: 'Appropriate for audience' }
    ];

    const evaluation = await agent.evaluateWithGeneratedRubric(
      GOOD_RESPONSE,
      TEST_PROMPT,
      criteria
    );

    expect(evaluation.success).toBe(true);
    expect(evaluation.scores.length).toBeGreaterThan(0);
  }, LONG_TIMEOUT_MS);

  it('should support chat-based evaluation', async () => {
    // NOTE(review): leading whitespace inside this template literal could not
    // be recovered from the listing; lines are kept flush-left as shown there.
    const reply = await agent.chat(`
Please evaluate this response for accuracy:

Question: What is photosynthesis?
Response: Photosynthesis is how plants make food using sunlight, water, and carbon dioxide.
`);

    expect(reply.text).toBeDefined();
    expect(reply.text.length).toBeGreaterThan(50);
  }, SHORT_TIMEOUT_MS);
});