Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Preview of examples/llm-as-judge-skills/tests/skills.test.ts as included in the skill package:
import { describe, it, expect, beforeAll, beforeEach } from 'vitest';
import { EvaluatorAgent } from '../src/agents/evaluator.js';
import { validateConfig } from '../src/config/index.js';

/**
 * Tests for skills implementation based on LLM-as-a-Judge research
 */

describe('LLM Evaluator Skill Tests', () => {
  let agent: EvaluatorAgent;

  beforeAll(() => {
    validateConfig();
  });

  beforeEach(() => {
    agent = new EvaluatorAgent();
  });

  describe('Direct Scoring Skill', () => {
    it('should use chain-of-thought in scoring', async () => {
      const result = await agent.score({
        response: 'Machine learning is a type of artificial intelligence that allows computers to learn from data.',
        prompt: 'Define machine learning',
        criteria: [
          { name: 'Accuracy', description: 'Factual correctness', weight: 1 }
        ]
      });

      expect(result.success).toBe(true);
      // Should have justification (evidence of CoT)
      if (result.scores.length > 0) {
        expect(result.scores[0].justification.length).toBeGreaterThan(20);
      }
    }, 60000);

    it('should handle multiple weighted criteria', async () => {
      const result = await agent.score({
        response: 'The mitochondria is the powerhouse of the cell. It produces ATP.',
        prompt: 'Explain the function of mitochondria',
        criteria: [
          { name: 'Accuracy', description: 'Scientific correctness', weight: 0.5 },
          { name: 'Completeness', description: 'Covers key points', weight: 0.3 },
          { name: 'Clarity', description: 'Easy to understand', weight: 0.2 }
        ]
      });

      expect(result.success).toBe(true);
      expect(result.scores).toHaveLength(3);
      expect(result.weightedScore).toBeDefined();
    }, 60000);
  });

  describe('Pairwise Comparison Skill', () => {
    it('should mitigate position bias with swap', async () => {
      const response1 = 'Water boils at 100 degrees Celsius at sea level.';
      const response2 = 'Water boils at 100°C (212°F) at standard atmospheric pressure (sea level).';

      const result = await agent.compare({
        responseA: response1,
        responseB: response2,
        prompt: 'At what temperature does water boil?',
        criteria: ['accuracy', 'completeness'],
        allowTie: true,
        swapPositions: true
      });

      expect(result.success).toBe(true);
      expect(result.positionConsistency).toBeDefined();
    }, 120000);

    it('should identify clear winner for quality difference', async () => {
      const good = `The Earth revolves around the Sun in an elliptical orbit,
taking approximately 365.25 days to complete one revolution.
This is why we have leap years every 4 years.`;

      const poor = 'The earth goes around the sun.';

      const result = await agent.compare({
        responseA: good,
        responseB: poor,
        prompt: 'How does the Earth orbit the Sun?',
        criteria: ['completeness', 'accuracy', 'detail'],
        allowTie: true,
        swapPositions: true
      });

      expect(result.success).toBe(true);
      expect(result.winner).toBe('A');
      expect(result.confidence).toBeGreaterThan(0.5);
    }, 120000);
  });

  describe('Rubric Generation Skill', () => {
    it('should generate domain-specific rubrics', async () => {
      const result = await agent.generateRubric({
        criterionName: 'Code Readability',
        criterionDescription: 'How easy the code is to understand and maintain',
        scale: '1-5',
        domain: 'software engineering',
        includeExamples: true,
        strictness: 'balanced'
      });

      expect(result.success).toBe(true);
      expect(result.levels.length).toBe(5);
      expect(result.metadata.domain).toBe('software engineering');

      // Should have code-specific terminology
      const allText = result.levels.map(l => l.description + l.characteristics.join(' ')).join(' ');
      expect(allText.toLowerCase()).toMatch(/variable|function|comment|name|structure|code|read/i);
    }, 60000);

    it('should provide edge case guidance', async () => {
      const result = await agent.generateRubric({
        criterionName: 'Factual Accuracy',
        criterionDescription: 'Whether claims are factually correct',
        scale: '1-5',
        includeExamples: false,
        strictness: 'strict'
      });

      expect(result.success).toBe(true);
      expect(result.edgeCases.length).toBeGreaterThan(0);
      result.edgeCases.forEach(ec => {
        expect(ec.situation).toBeDefined();
        expect(ec.guidance).toBeDefined();
      });
    }, 60000);
  });

  describe('Context Fundamentals Skill Application', () => {
    it('should utilize provided context in evaluation', async () => {
      const context = `The user is a medical professional asking about drug interactions.
Technical terminology is appropriate.`;

      const result = await agent.score({
        response: 'Combining SSRIs with MAOIs can lead to serotonin syndrome, a potentially life-threatening condition.',
        prompt: 'What are the risks of combining antidepressants?',
        context,
        criteria: [
          { name: 'Accuracy', description: 'Medical accuracy', weight: 0.5 },
          { name: 'Appropriateness', description: 'Appropriate for audience', weight: 0.5 }
        ]
      });

      expect(result.success).toBe(true);
      // Technical response should score well given medical context
      expect(result.overallScore).toBeGreaterThanOrEqual(2);
    }, 60000);
  });
});

describe('Skill Input/Output Validation', () => {
  let agent: EvaluatorAgent;

  beforeAll(() => {
    validateConfig();
    agent = new EvaluatorAgent();
  });

  it('should validate DirectScore input schema', async () => {
    const result = await agent.score({
      response: 'Test response',
      prompt: 'Test prompt',
      criteria: [{ name: 'Test', description: 'Test criterion', weight: 1 }]
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('scores');
    expect(result).toHaveProperty('overallScore');
    expect(result).toHaveProperty('summary');
    expect(result).toHaveProperty('metadata');
  }, 60000);

  it('should validate PairwiseCompare output structure', async () => {
    const result = await agent.compare({
      responseA: 'Response A content',
      responseB: 'Response B content',
      prompt: 'Test prompt',
      criteria: ['quality'],
      allowTie: true,
      swapPositions: false
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('winner');
    expect(['A', 'B', 'TIE']).toContain(result.winner);
    expect(result).toHaveProperty('confidence');
    expect(result.confidence).toBeGreaterThanOrEqual(0);
    expect(result.confidence).toBeLessThanOrEqual(1);
    expect(result).toHaveProperty('comparison');
    expect(result).toHaveProperty('metadata');
  }, 60000);

  it('should validate GenerateRubric output structure', async () => {
    const result = await agent.generateRubric({
      criterionName: 'Test',
      criterionDescription: 'Test criterion',
      scale: '1-5',
      includeExamples: false,
      strictness: 'balanced'
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('criterion');
    expect(result).toHaveProperty('scale');
    expect(result).toHaveProperty('levels');
    expect(result).toHaveProperty('scoringGuidelines');
    expect(result).toHaveProperty('edgeCases');
    expect(result).toHaveProperty('metadata');
  }, 60000);
});
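
For orientation, the result shapes these tests exercise can be summarized as TypeScript interfaces. The following is a sketch inferred from the assertions above, not the package's published type definitions; exact field types (and fields not directly asserted, such as name and score inside scores) are assumptions.

// Result shapes inferred from the assertions in skills.test.ts.
// Sketches only; field types are assumptions, not the package's own types.
interface ScoreResult {
  success: boolean;
  scores: Array<{ name: string; score: number; justification: string }>;
  overallScore: number;
  weightedScore?: number; // asserted only in the weighted-criteria test
  summary: string;
  metadata: Record<string, unknown>;
}

interface CompareResult {
  success: boolean;
  winner: 'A' | 'B' | 'TIE';
  confidence: number; // asserted to lie within [0, 1]
  comparison: string;
  positionConsistency?: boolean; // asserted when swapPositions is true
  metadata: Record<string, unknown>;
}

interface RubricResult {
  success: boolean;
  criterion: string;
  scale: string;
  levels: Array<{ description: string; characteristics: string[] }>;
  scoringGuidelines: string;
  edgeCases: Array<{ situation: string; guidance: string }>;
  metadata: { domain?: string };
}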
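
The position-bias test ('should mitigate position bias with swap') relies on judging each pair twice with the presentation order swapped. The agent's internal implementation is not shown in this listing; below is a minimal sketch of the general technique, with a hypothetical judgeOnce callback standing in for a single LLM verdict, and a tie used as one common fallback when the two orderings disagree.

// Sketch of swap-based position-bias mitigation (assumed implementation).
// judgeOnce is hypothetical: it returns the winner relative to the order
// in which the two responses were presented.
async function compareWithSwap(
  judgeOnce: (first: string, second: string) => Promise<'A' | 'B' | 'TIE'>,
  responseA: string,
  responseB: string
): Promise<{ winner: 'A' | 'B' | 'TIE'; positionConsistency: boolean }> {
  const original = await judgeOnce(responseA, responseB); // A first
  const swapped = await judgeOnce(responseB, responseA);  // B first
  // Map the swapped verdict back into the original A/B frame.
  const unswapped = swapped === 'A' ? 'B' : swapped === 'B' ? 'A' : 'TIE';
  const positionConsistency = original === unswapped;
  // Disagreement across orderings suggests position bias; fall back to a tie.
  const winner = positionConsistency ? original : 'TIE';
  return { winner, positionConsistency };
}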