Source from repo

Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.

muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link

Files

241

Skill

n/a

Size

2.6 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

Code · 165 lines · Free

examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts

1import { tool } from 'ai';
2import { z } from 'zod';
3import { openai } from '@ai-sdk/openai';
4import { generateText } from 'ai';
5import { config } from '../../config/index.js';
6 
/**
 * Relative weight of a criterion: a number in [0, 1] that falls back to 1
 * when the caller omits it.
 */
const criterionWeight = z
  .number()
  .min(0)
  .max(1)
  .default(1)
  .describe('Relative importance');

/** A single evaluation criterion: its name, what it measures, and its weight. */
const CriterionSchema = z.object({
  name: z.string().describe('Name of the criterion'),
  description: z.string().describe('What this criterion measures'),
  weight: criterionWeight
});
12 
/** Scoring scales a rubric may use; the number after the dash is the top score. */
const RUBRIC_SCALES = ['1-3', '1-5', '1-10'] as const;

/**
 * Optional scoring rubric: the scale to score on (defaults to '1-5') and an
 * optional map from a level label to its textual description.
 */
const RubricSchema = z.object({
  scale: z.enum(RUBRIC_SCALES).default('1-5'),
  levelDescriptions: z.optional(z.record(z.string(), z.string()))
});
17 
/** Field definitions for the direct-score tool input, kept separate for readability. */
const directScoreInputShape = {
  response: z.string().describe('The LLM response to evaluate'),
  prompt: z.string().describe('The original prompt that generated the response'),
  context: z.optional(z.string()).describe('Additional context'),
  criteria: z.array(CriterionSchema).min(1).describe('Evaluation criteria'),
  rubric: z.optional(RubricSchema)
};

/**
 * Input accepted by the direct-score evaluation: the response under review,
 * the prompt that produced it, optional extra context, at least one
 * criterion, and an optional rubric.
 */
export const DirectScoreInputSchema = z.object(directScoreInputShape);

/** Parsed (post-default) form of {@link DirectScoreInputSchema}. */
export type DirectScoreInput = z.infer<typeof DirectScoreInputSchema>;
27 
/** Score awarded for one criterion, with its supporting evidence and advice. */
const CriterionScoreSchema = z.object({
  criterion: z.string(),
  score: z.number(),
  maxScore: z.number(),
  justification: z.string(),
  evidence: z.array(z.string()),
  improvement: z.string()
});

/** Narrative summary of the evaluation as a whole. */
const EvaluationSummarySchema = z.object({
  assessment: z.string(),
  strengths: z.array(z.string()),
  weaknesses: z.array(z.string()),
  priorities: z.array(z.string())
});

/** Bookkeeping about the evaluation run itself. */
const EvaluationMetadataSchema = z.object({
  evaluationTimeMs: z.number(),
  model: z.string(),
  criteriaCount: z.number()
});

/**
 * Result of a direct-score evaluation: a success flag, per-criterion scores,
 * the plain and weighted aggregate scores, a summary, and run metadata.
 */
export const DirectScoreOutputSchema = z.object({
  success: z.boolean(),
  scores: z.array(CriterionScoreSchema),
  overallScore: z.number(),
  weightedScore: z.number(),
  summary: EvaluationSummarySchema,
  metadata: EvaluationMetadataSchema
});

/** Parsed form of {@link DirectScoreOutputSchema}. */
export type DirectScoreOutput = z.infer<typeof DirectScoreOutputSchema>;
54 
/** Scale assumed when the caller supplies no rubric. */
const DEFAULT_SCALE = '1-5';

/**
 * Shape the judge model is asked to return. Text fields and the summary are
 * lenient (missing values become empty) so a slightly incomplete reply still
 * yields usable scores; `criterion` and a numeric `score` are mandatory, and
 * at least one score must be present. Unknown keys are dropped by Zod, so
 * arbitrary model-supplied fields never reach the tool output.
 */
const JudgeResponseSchema = z.object({
  scores: z
    .array(
      z.object({
        criterion: z.string(),
        score: z.number(),
        evidence: z.array(z.string()).nullish(),
        justification: z.string().default(''),
        improvement: z.string().default('')
      })
    )
    .min(1),
  summary: z
    .object({
      assessment: z.string().default(''),
      strengths: z.array(z.string()).default([]),
      weaknesses: z.array(z.string()).default([]),
      priorities: z.array(z.string()).default([])
    })
    .default({ assessment: '', strengths: [], weaknesses: [], priorities: [] })
});

/** Returns the upper bound of a `"low-high"` scale string, or 5 if it cannot be read. */
function maxScoreForScale(scale: string): number {
  const upper = Number.parseInt(scale.split('-')[1] ?? '', 10);
  return Number.isNaN(upper) ? 5 : upper;
}

/**
 * Returns the outermost `{ ... }` span of `text`, so JSON wrapped in markdown
 * fences or surrounded by prose can still be parsed. Text without a brace
 * pair is returned unchanged (and will fail in `JSON.parse`).
 */
function extractJsonObject(text: string): string {
  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  return start !== -1 && end > start ? text.slice(start, end + 1) : text;
}

/** Rounds to two decimal places. */
function roundToHundredths(value: number): number {
  return Math.round(value * 100) / 100;
}

/**
 * Computes the plain mean and the weighted mean of the judged scores.
 *
 * The weighted mean is normalised by the weights that were actually applied,
 * so it stays a true average when the judge omits, repeats or renames a
 * criterion. A score whose criterion name matches no input criterion gets
 * weight 1; an explicit weight of 0 is honoured. If every applied weight is 0
 * the weighted score falls back to the plain mean instead of dividing by zero.
 */
function aggregateScores(
  scores: ReadonlyArray<{ criterion: string; score: number }>,
  criteria: ReadonlyArray<{ name: string; weight: number }>
): { overallScore: number; weightedScore: number } {
  const weightByName = new Map<string, number>();
  for (const c of criteria) {
    // First definition wins, mirroring a left-to-right lookup by name.
    if (!weightByName.has(c.name)) {
      weightByName.set(c.name, c.weight);
    }
  }

  let plainSum = 0;
  let weightedSum = 0;
  let appliedWeight = 0;
  for (const s of scores) {
    const weight = weightByName.get(s.criterion) ?? 1;
    plainSum += s.score;
    weightedSum += s.score * weight;
    appliedWeight += weight;
  }

  const overall = scores.length > 0 ? plainSum / scores.length : 0;
  const weighted = appliedWeight > 0 ? weightedSum / appliedWeight : overall;
  return {
    overallScore: roundToHundredths(overall),
    weightedScore: roundToHundredths(weighted)
  };
}

/** Builds the system prompt instructing the judge how to score on a 1..maxScore scale. */
function buildSystemPrompt(maxScore: number): string {
  return `You are an expert evaluator. Assess the response against each criterion.
For each criterion:
1. Find specific evidence in the response
2. Score according to the rubric (1-${maxScore} scale)
3. Justify your score
4. Suggest one improvement

Be objective and consistent. Base scores on explicit evidence.`;
}

/** Builds the user prompt: original prompt, optional context, response, criteria, optional rubric, and the JSON contract. */
function buildUserPrompt(input: DirectScoreInput): string {
  const contextSection = input.context ? `## Context\n${input.context}\n` : '';
  const criteriaList = input.criteria
    .map((c, i) => `${i + 1}. **${c.name}** (weight: ${c.weight}): ${c.description}`)
    .join('\n');
  const levels = input.rubric?.levelDescriptions;
  const rubricSection = levels
    ? `## Rubric\n${Object.entries(levels).map(([k, v]) => `- ${k}: ${v}`).join('\n')}`
    : '';

  return `## Original Prompt
${input.prompt}

${contextSection}
## Response to Evaluate
${input.response}

## Criteria
${criteriaList}

${rubricSection}

Respond with valid JSON matching this structure:
{
  "scores": [
    {
      "criterion": "criterion name",
      "score": number,
      "evidence": ["quote or observation 1", "quote 2"],
      "justification": "why this score",
      "improvement": "specific suggestion"
    }
  ],
  "summary": {
    "assessment": "overall quality summary",
    "strengths": ["strength 1", "strength 2"],
    "weaknesses": ["weakness 1"],
    "priorities": ["most important improvement"]
  }
}`;
}

/**
 * Scores a response against each supplied criterion using an LLM judge.
 *
 * Sends the prompt, response, criteria and optional rubric to the configured
 * OpenAI model, validates the JSON reply, and aggregates the per-criterion
 * scores into a plain mean and a weighted mean (both rounded to two decimals).
 *
 * @param input - Response to evaluate, the originating prompt, criteria and optional rubric.
 * @returns The evaluation result. This function does not reject for model,
 *   parsing or validation failures: those are reported as `success: false`
 *   with empty scores and the error message in `summary.assessment`.
 */
export async function executeDirectScore(input: DirectScoreInput): Promise<DirectScoreOutput> {
  const startTime = Date.now();
  const maxScore = maxScoreForScale(input.rubric?.scale ?? DEFAULT_SCALE);

  // Evaluated lazily so the elapsed time is measured at the moment of return.
  const buildMetadata = () => ({
    evaluationTimeMs: Date.now() - startTime,
    model: config.openai.model,
    criteriaCount: input.criteria.length
  });

  try {
    const result = await generateText({
      model: openai(config.openai.model),
      system: buildSystemPrompt(maxScore),
      prompt: buildUserPrompt(input),
      temperature: 0.3
    });

    // The model's reply is untrusted: parse it as unknown, then validate the shape.
    const raw: unknown = JSON.parse(extractJsonObject(result.text));
    const judged = JudgeResponseSchema.parse(raw);

    const { overallScore, weightedScore } = aggregateScores(judged.scores, input.criteria);

    return {
      success: true,
      scores: judged.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        maxScore,
        justification: s.justification,
        evidence: s.evidence ?? [],
        improvement: s.improvement
      })),
      overallScore,
      weightedScore,
      summary: judged.summary,
      metadata: buildMetadata()
    };
  } catch (error: unknown) {
    return {
      success: false,
      scores: [],
      overallScore: 0,
      weightedScore: 0,
      summary: {
        assessment: `Evaluation failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
        strengths: [],
        weaknesses: [],
        priorities: []
      },
      metadata: buildMetadata()
    };
  }
}
156 
/** Tool description shown to the calling model, one sentence per line. */
const DIRECT_SCORE_DESCRIPTION = [
  'Evaluate a response by scoring it against specific criteria.',
  'Use for objective evaluations like accuracy, completeness, clarity.',
  'Returns structured scores with justifications.'
].join('\n');

/**
 * AI SDK tool definition for direct scoring: arguments are described by
 * `DirectScoreInputSchema` and handled by `executeDirectScore`.
 */
export const directScoreTool = tool({
  description: DIRECT_SCORE_DESCRIPTION,
  parameters: DirectScoreInputSchema,
  execute: executeDirectScore
});
164 
165

Preparing the source view

Agent Skills for Context Engineering

examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts