Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
241
Skill
n/a
Size
2.6 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
examples/llm-as-judge-skills/tests/evaluation.test.ts

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
Code · 234 lines · Free
examples/llm-as-judge-skills/tests/evaluation.test.ts
import { describe, it, expect, beforeAll } from 'vitest';
import {
  executeDirectScore,
  executePairwiseCompare,
  executeGenerateRubric,
  EvaluatorAgent
} from '../src/index.js';
import { validateConfig } from '../src/config/index.js';
9 
// Test fixtures
// NOTE: several lines inside the multi-line fixtures end with a trailing space;
// that whitespace is part of the original strings and is kept as-is.

/** Prompt that every fixture response below is an answer to. */
const TEST_PROMPT = 'Explain quantum entanglement to a high school student';

/**
 * Detailed answer (analogy, numbered list, applications).
 * The tests in this file expect it to score >= 3 on a 1-5 scale and to win
 * pairwise comparison against POOR_RESPONSE.
 */
const GOOD_RESPONSE = `Quantum entanglement is like having two magical coins that are connected in a special way. 
When you flip one coin and it lands on heads, the other coin will instantly show tails, 
no matter how far apart they are - even if one is on Earth and one is on Mars.

Here's what makes it special:
1. The connection is instantaneous - faster than light
2. You can't predict which side either coin will land on
3. But once you see one, you know exactly what the other shows

Scientists like Einstein called this "spooky action at a distance" because it seems impossible, 
but experiments have proven it's real. This phenomenon is now being used to develop 
super-secure communication systems and quantum computers.`;

/** Two-line, low-effort answer; the tests expect it to score below GOOD_RESPONSE. */
const POOR_RESPONSE = `Quantum entanglement is when particles are connected. 
It's complicated physics stuff. Scientists study it.`;

/** Short but correct answer; used as the "similar responses" pair and as a middle-quality opponent. */
const MEDIUM_RESPONSE = `Quantum entanglement happens when two particles become linked together. 
When you measure one particle, you instantly know something about the other particle, 
even if they're far apart. It's used in quantum computing research.`;
32 
// Validate config once before all tests.
// The arrow keeps a block body on purpose: the hook should return nothing, so
// whatever validateConfig() returns is never handed back to the test runner.
beforeAll(() => {
  validateConfig();
});
37 
/**
 * Direct scoring: grade a single response against weighted criteria.
 * NOTE(review): the 60-120 s per-test timeouts suggest these calls reach a
 * real model backend - confirm before running in CI without credentials.
 */
describe('Direct Score Tool', () => {
  it('should score a response against criteria', async () => {
    const result = await executeDirectScore({
      response: GOOD_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: [
        {
          name: 'Accuracy',
          description: 'Scientific correctness of the explanation',
          weight: 0.4
        },
        {
          name: 'Clarity',
          description: 'Understandable for a high school student',
          weight: 0.3
        },
        {
          name: 'Engagement',
          description: 'Interesting and memorable',
          weight: 0.3
        }
      ],
      rubric: {
        scale: '1-5'
      }
    });

    expect(result.success).toBe(true);
    // One score entry per criterion passed in.
    expect(result.scores).toHaveLength(3);
    // Overall score must stay inside the requested 1-5 scale.
    expect(result.overallScore).toBeGreaterThan(0);
    expect(result.overallScore).toBeLessThanOrEqual(5);
    expect(result.metadata.criteriaCount).toBe(3);

    // Good response should score reasonably well
    expect(result.overallScore).toBeGreaterThanOrEqual(3);
  }, 60000);

  it('should provide lower scores for poor responses', async () => {
    // Both responses are graded with the same single criterion so that the
    // two overall scores are directly comparable. A fresh criteria array is
    // built per call, exactly as the two inline requests did before.
    const scoreOverallQuality = (response: string) =>
      executeDirectScore({
        response,
        prompt: TEST_PROMPT,
        criteria: [
          { name: 'Quality', description: 'Overall quality', weight: 1 }
        ]
      });

    // The two scoring calls do not depend on each other, so run them concurrently.
    const [goodResult, poorResult] = await Promise.all([
      scoreOverallQuality(GOOD_RESPONSE),
      scoreOverallQuality(POOR_RESPONSE)
    ]);

    expect(goodResult.success).toBe(true);
    expect(poorResult.success).toBe(true);
    expect(goodResult.overallScore).toBeGreaterThan(poorResult.overallScore);
  }, 120000);
});
97 
/**
 * Pairwise comparison: pick the better of two responses (or a tie) for the
 * same prompt, optionally with position swapping enabled.
 */
describe('Pairwise Compare Tool', () => {
  it('should correctly identify the better response', async () => {
    const result = await executePairwiseCompare({
      responseA: GOOD_RESPONSE,
      responseB: POOR_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['accuracy', 'clarity', 'completeness', 'engagement'],
      allowTie: true,
      swapPositions: true
    });

    expect(result.success).toBe(true);
    // GOOD_RESPONSE is passed as A, so A must win with better-than-even confidence.
    expect(result.winner).toBe('A');
    expect(result.confidence).toBeGreaterThan(0.5);
  }, 120000);

  it('should handle similar responses appropriately', async () => {
    // The very same text is submitted on both sides.
    const result = await executePairwiseCompare({
      responseA: MEDIUM_RESPONSE,
      responseB: MEDIUM_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['quality'],
      allowTie: true,
      swapPositions: true
    });

    expect(result.success).toBe(true);
    // Same response should tie
    expect(result.winner).toBe('TIE');
  }, 120000);

  it('should provide comparison details for each criterion', async () => {
    const result = await executePairwiseCompare({
      responseA: GOOD_RESPONSE,
      responseB: MEDIUM_RESPONSE,
      prompt: TEST_PROMPT,
      criteria: ['accuracy', 'completeness'],
      allowTie: true,
      swapPositions: false
    });

    expect(result.success).toBe(true);
    // One comparison entry per requested criterion.
    expect(result.comparison).toHaveLength(2);
    for (const entry of result.comparison) {
      expect(entry.criterion).toBeDefined();
      expect(['A', 'B', 'TIE']).toContain(entry.winner);
      expect(entry.reasoning).toBeDefined();
    }
  }, 60000);
});
148 
/**
 * Rubric generation: build a levelled scoring rubric for one criterion and
 * check its shape and the echoed strictness setting.
 */
describe('Generate Rubric Tool', () => {
  it('should generate a complete rubric', async () => {
    const result = await executeGenerateRubric({
      criterionName: 'Factual Accuracy',
      criterionDescription: 'How factually correct is the content',
      scale: '1-5',
      domain: 'educational content',
      includeExamples: true,
      strictness: 'balanced'
    });

    expect(result.success).toBe(true);
    // A '1-5' scale should yield five levels bounded by 1 and 5.
    expect(result.levels).toHaveLength(5);
    expect(result.scale.min).toBe(1);
    expect(result.scale.max).toBe(5);
    expect(result.scoringGuidelines.length).toBeGreaterThan(0);
    expect(result.edgeCases.length).toBeGreaterThan(0);

    // Check level structure
    for (const level of result.levels) {
      expect(level.score).toBeGreaterThanOrEqual(1);
      expect(level.score).toBeLessThanOrEqual(5);
      expect(level.label).toBeDefined();
      expect(level.description).toBeDefined();
      expect(level.characteristics.length).toBeGreaterThan(0);
    }
  }, 60000);

  it('should respect strictness setting', async () => {
    // Identical request except for the strictness value under test.
    const generateCodeQualityRubric = (strictness: 'lenient' | 'strict') =>
      executeGenerateRubric({
        criterionName: 'Code Quality',
        criterionDescription: 'Quality of code implementation',
        scale: '1-5',
        includeExamples: false,
        strictness
      });

    // The two generations are independent, so run them concurrently.
    const [lenient, strict] = await Promise.all([
      generateCodeQualityRubric('lenient'),
      generateCodeQualityRubric('strict')
    ]);

    expect(lenient.success).toBe(true);
    expect(strict.success).toBe(true);
    expect(lenient.metadata.strictness).toBe('lenient');
    expect(strict.metadata.strictness).toBe('strict');
  }, 120000);
});
200 
/**
 * EvaluatorAgent: exercises the rubric-then-score workflow and the free-form
 * chat entry point on a single shared agent instance.
 */
describe('Evaluator Agent', () => {
  // Assigned in beforeAll below; shared by every test in this suite.
  let agent: EvaluatorAgent;

  beforeAll(() => {
    agent = new EvaluatorAgent();
  });

  it('should provide integrated evaluation workflow', async () => {
    // Criteria are given without weights or rubrics here.
    const result = await agent.evaluateWithGeneratedRubric(
      GOOD_RESPONSE,
      TEST_PROMPT,
      [
        { name: 'Accuracy', description: 'Scientific correctness' },
        { name: 'Accessibility', description: 'Appropriate for audience' }
      ]
    );

    expect(result.success).toBe(true);
    expect(result.scores.length).toBeGreaterThan(0);
  }, 120000);

  it('should support chat-based evaluation', async () => {
    const result = await agent.chat(`
      Please evaluate this response for accuracy:
      
      Question: What is photosynthesis?
      Response: Photosynthesis is how plants make food using sunlight, water, and carbon dioxide.
    `);

    expect(result.text).toBeDefined();
    // Only checks that the reply is non-trivial in length (more than 50 characters).
    expect(result.text.length).toBeGreaterThan(50);
  }, 60000);
});
234
Preparing the source view

Agent Skills for Context Engineering

examples/llm-as-judge-skills/tests/evaluation.test.ts