Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Preview of examples/llm-as-judge-skills/tests/skills.test.ts as included in the skill package:
import { describe, it, expect, beforeAll, beforeEach } from 'vitest';
import { EvaluatorAgent } from '../src/agents/evaluator.js';
import { validateConfig } from '../src/config/index.js';

/**
 * Tests for skills implementation based on LLM-as-a-Judge research
 */

describe('LLM Evaluator Skill Tests', () => {
  let agent: EvaluatorAgent;

  beforeAll(() => {
    validateConfig();
  });

  beforeEach(() => {
    agent = new EvaluatorAgent();
  });

  describe('Direct Scoring Skill', () => {
    it('should use chain-of-thought in scoring', async () => {
      const result = await agent.score({
        response: 'Machine learning is a type of artificial intelligence that allows computers to learn from data.',
        prompt: 'Define machine learning',
        criteria: [
          { name: 'Accuracy', description: 'Factual correctness', weight: 1 }
        ]
      });

      expect(result.success).toBe(true);
      // Should have justification (evidence of CoT)
      if (result.scores.length > 0) {
        expect(result.scores[0].justification.length).toBeGreaterThan(20);
      }
    }, 60000);

    it('should handle multiple weighted criteria', async () => {
      const result = await agent.score({
        response: 'The mitochondria is the powerhouse of the cell. It produces ATP.',
        prompt: 'Explain the function of mitochondria',
        criteria: [
          { name: 'Accuracy', description: 'Scientific correctness', weight: 0.5 },
          { name: 'Completeness', description: 'Covers key points', weight: 0.3 },
          { name: 'Clarity', description: 'Easy to understand', weight: 0.2 }
        ]
      });

      expect(result.success).toBe(true);
      expect(result.scores).toHaveLength(3);
      expect(result.weightedScore).toBeDefined();
    }, 60000);
  });

  describe('Pairwise Comparison Skill', () => {
    it('should mitigate position bias with swap', async () => {
      const response1 = 'Water boils at 100 degrees Celsius at sea level.';
      const response2 = 'Water boils at 100°C (212°F) at standard atmospheric pressure (sea level).';

      const result = await agent.compare({
        responseA: response1,
        responseB: response2,
        prompt: 'At what temperature does water boil?',
        criteria: ['accuracy', 'completeness'],
        allowTie: true,
        swapPositions: true
      });

      expect(result.success).toBe(true);
      expect(result.positionConsistency).toBeDefined();
    }, 120000);

    it('should identify clear winner for quality difference', async () => {
      const good = `The Earth revolves around the Sun in an elliptical orbit,
taking approximately 365.25 days to complete one revolution.
This is why we have leap years every 4 years.`;

      const poor = 'The earth goes around the sun.';

      const result = await agent.compare({
        responseA: good,
        responseB: poor,
        prompt: 'How does the Earth orbit the Sun?',
        criteria: ['completeness', 'accuracy', 'detail'],
        allowTie: true,
        swapPositions: true
      });

      expect(result.success).toBe(true);
      expect(result.winner).toBe('A');
      expect(result.confidence).toBeGreaterThan(0.5);
    }, 120000);
  });

  describe('Rubric Generation Skill', () => {
    it('should generate domain-specific rubrics', async () => {
      const result = await agent.generateRubric({
        criterionName: 'Code Readability',
        criterionDescription: 'How easy the code is to understand and maintain',
        scale: '1-5',
        domain: 'software engineering',
        includeExamples: true,
        strictness: 'balanced'
      });

      expect(result.success).toBe(true);
      expect(result.levels.length).toBe(5);
      expect(result.metadata.domain).toBe('software engineering');

      // Should have code-specific terminology
      const allText = result.levels.map(l => l.description + l.characteristics.join(' ')).join(' ');
      expect(allText.toLowerCase()).toMatch(/variable|function|comment|name|structure|code|read/i);
    }, 60000);

    it('should provide edge case guidance', async () => {
      const result = await agent.generateRubric({
        criterionName: 'Factual Accuracy',
        criterionDescription: 'Whether claims are factually correct',
        scale: '1-5',
        includeExamples: false,
        strictness: 'strict'
      });

      expect(result.success).toBe(true);
      expect(result.edgeCases.length).toBeGreaterThan(0);
      result.edgeCases.forEach(ec => {
        expect(ec.situation).toBeDefined();
        expect(ec.guidance).toBeDefined();
      });
    }, 60000);
  });

  describe('Context Fundamentals Skill Application', () => {
    it('should utilize provided context in evaluation', async () => {
      const context = `The user is a medical professional asking about drug interactions.
Technical terminology is appropriate.`;

      const result = await agent.score({
        response: 'Combining SSRIs with MAOIs can lead to serotonin syndrome, a potentially life-threatening condition.',
        prompt: 'What are the risks of combining antidepressants?',
        context,
        criteria: [
          { name: 'Accuracy', description: 'Medical accuracy', weight: 0.5 },
          { name: 'Appropriateness', description: 'Appropriate for audience', weight: 0.5 }
        ]
      });

      expect(result.success).toBe(true);
      // Technical response should score well given medical context
      expect(result.overallScore).toBeGreaterThanOrEqual(2);
    }, 60000);
  });
});

describe('Skill Input/Output Validation', () => {
  let agent: EvaluatorAgent;

  beforeAll(() => {
    validateConfig();
    agent = new EvaluatorAgent();
  });

  it('should validate DirectScore input schema', async () => {
    const result = await agent.score({
      response: 'Test response',
      prompt: 'Test prompt',
      criteria: [{ name: 'Test', description: 'Test criterion', weight: 1 }]
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('scores');
    expect(result).toHaveProperty('overallScore');
    expect(result).toHaveProperty('summary');
    expect(result).toHaveProperty('metadata');
  }, 60000);

  it('should validate PairwiseCompare output structure', async () => {
    const result = await agent.compare({
      responseA: 'Response A content',
      responseB: 'Response B content',
      prompt: 'Test prompt',
      criteria: ['quality'],
      allowTie: true,
      swapPositions: false
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('winner');
    expect(['A', 'B', 'TIE']).toContain(result.winner);
    expect(result).toHaveProperty('confidence');
    expect(result.confidence).toBeGreaterThanOrEqual(0);
    expect(result.confidence).toBeLessThanOrEqual(1);
    expect(result).toHaveProperty('comparison');
    expect(result).toHaveProperty('metadata');
  }, 60000);

  it('should validate GenerateRubric output structure', async () => {
    const result = await agent.generateRubric({
      criterionName: 'Test',
      criterionDescription: 'Test criterion',
      scale: '1-5',
      includeExamples: false,
      strictness: 'balanced'
    });

    expect(result).toHaveProperty('success');
    expect(result).toHaveProperty('criterion');
    expect(result).toHaveProperty('scale');
    expect(result).toHaveProperty('levels');
    expect(result).toHaveProperty('scoringGuidelines');
    expect(result).toHaveProperty('edgeCases');
    expect(result).toHaveProperty('metadata');
  }, 60000);
});
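
For orientation, the result shapes these tests exercise can be summarized as TypeScript interfaces. The following is a sketch inferred from the assertions above, not the package's published type definitions; exact field types (and fields not directly asserted, such as name and score inside scores) are assumptions.

// Result shapes inferred from the assertions in skills.test.ts.
// Sketches only; field types are assumptions, not the package's own types.
interface ScoreResult {
  success: boolean;
  scores: Array<{ name: string; score: number; justification: string }>;
  overallScore: number;
  weightedScore?: number; // asserted only in the weighted-criteria test
  summary: string;
  metadata: Record<string, unknown>;
}

interface CompareResult {
  success: boolean;
  winner: 'A' | 'B' | 'TIE';
  confidence: number; // asserted to lie within [0, 1]
  comparison: string;
  positionConsistency?: boolean; // asserted when swapPositions is true
  metadata: Record<string, unknown>;
}

interface RubricResult {
  success: boolean;
  criterion: string;
  scale: string;
  levels: Array<{ description: string; characteristics: string[] }>;
  scoringGuidelines: string;
  edgeCases: Array<{ situation: string; guidance: string }>;
  metadata: { domain?: string };
}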
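
The position-bias test ('should mitigate position bias with swap') relies on judging each pair twice with the presentation order swapped. The agent's internal implementation is not shown in this listing; below is a minimal sketch of the general technique, with a hypothetical judgeOnce callback standing in for a single LLM verdict, and a tie used as one common fallback when the two orderings disagree.

// Sketch of swap-based position-bias mitigation (assumed implementation).
// judgeOnce is hypothetical: it returns the winner relative to the order
// in which the two responses were presented.
async function compareWithSwap(
  judgeOnce: (first: string, second: string) => Promise<'A' | 'B' | 'TIE'>,
  responseA: string,
  responseB: string
): Promise<{ winner: 'A' | 'B' | 'TIE'; positionConsistency: boolean }> {
  const original = await judgeOnce(responseA, responseB); // A first
  const swapped = await judgeOnce(responseB, responseA);  // B first
  // Map the swapped verdict back into the original A/B frame.
  const unswapped = swapped === 'A' ? 'B' : swapped === 'B' ? 'A' : 'TIE';
  const positionConsistency = original === unswapped;
  // Disagreement across orderings suggests position bias; fall back to a tie.
  const winner = positionConsistency ? original : 'TIE';
  return { winner, positionConsistency };
}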