Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts
import { generateText, tool } from 'ai';
import { z } from 'zod';
import { openai } from '@ai-sdk/openai';
import { config } from '../../config/index.js';

/** A single evaluation criterion supplied by the caller. */
const CriterionSchema = z.object({
  name: z.string().describe('Name of the criterion'),
  description: z.string().describe('What this criterion measures'),
  weight: z.number().min(0).max(1).default(1).describe('Relative importance')
});

/** Scoring scale plus optional per-level descriptions shown to the judge. */
const RubricSchema = z.object({
  scale: z.enum(['1-3', '1-5', '1-10']).default('1-5'),
  levelDescriptions: z.record(z.string(), z.string()).optional()
});

/** Input accepted by {@link executeDirectScore} and by the exported tool. */
export const DirectScoreInputSchema = z.object({
  response: z.string().describe('The LLM response to evaluate'),
  prompt: z.string().describe('The original prompt that generated the response'),
  context: z.string().optional().describe('Additional context'),
  criteria: z.array(CriterionSchema).min(1).describe('Evaluation criteria'),
  rubric: RubricSchema.optional()
});

export type DirectScoreInput = z.infer<typeof DirectScoreInputSchema>;

/** Shape of the result returned by {@link executeDirectScore}. */
export const DirectScoreOutputSchema = z.object({
  success: z.boolean(),
  scores: z.array(z.object({
    criterion: z.string(),
    score: z.number(),
    maxScore: z.number(),
    justification: z.string(),
    evidence: z.array(z.string()),
    improvement: z.string()
  })),
  overallScore: z.number(),
  weightedScore: z.number(),
  summary: z.object({
    assessment: z.string(),
    strengths: z.array(z.string()),
    weaknesses: z.array(z.string()),
    priorities: z.array(z.string())
  }),
  metadata: z.object({
    evaluationTimeMs: z.number(),
    model: z.string(),
    criteriaCount: z.number()
  })
});

export type DirectScoreOutput = z.infer<typeof DirectScoreOutputSchema>;

/** Upper bound of each supported rubric scale. */
const MAX_SCORE_BY_SCALE: Record<z.infer<typeof RubricSchema>['scale'], number> = {
  '1-3': 3,
  '1-5': 5,
  '1-10': 10
};

/**
 * Shape the judge model is asked to return. Validating against it keeps
 * malformed or partial model output from leaking into the typed result;
 * list and text fields the model may omit are defaulted rather than rejected.
 */
const JudgeResponseSchema = z.object({
  scores: z.array(z.object({
    criterion: z.string(),
    score: z.number(),
    evidence: z.array(z.string()).default([]),
    justification: z.string().default(''),
    improvement: z.string().default('')
  })).min(1),
  summary: z.object({
    assessment: z.string(),
    strengths: z.array(z.string()).default([]),
    weaknesses: z.array(z.string()).default([]),
    priorities: z.array(z.string()).default([])
  })
});

type JudgeResponse = z.infer<typeof JudgeResponseSchema>;

/** Rounds to two decimal places. */
function roundTo2(value: number): number {
  return Math.round(value * 100) / 100;
}

/** Normalizes a criterion name so minor case/whitespace drift still matches. */
function normalizeName(name: string): string {
  return name.trim().toLowerCase();
}

/**
 * Parses the model's text as JSON. Tries the raw text first, then the contents
 * of a surrounding Markdown code fence, then the outermost `{...}` span.
 *
 * @throws The first JSON parse error when no candidate parses.
 */
function parseModelJson(text: string): unknown {
  const trimmed = text.trim();
  const candidates = [trimmed];

  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(trimmed);
  if (fenced?.[1]) {
    candidates.push(fenced[1]);
  }

  const start = trimmed.indexOf('{');
  const end = trimmed.lastIndexOf('}');
  if (start !== -1 && end > start) {
    candidates.push(trimmed.slice(start, end + 1));
  }

  let firstError: unknown;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate) as unknown;
    } catch (error: unknown) {
      if (firstError === undefined) {
        firstError = error;
      }
    }
  }
  throw firstError instanceof Error ? firstError : new Error('Model response was not valid JSON');
}

/**
 * Computes the unweighted mean and the weighted mean of the returned scores.
 *
 * The weighted mean is normalized by the weights actually applied to the
 * returned scores, so an omitted or unrecognized criterion cannot push the
 * result outside the scale. A score whose criterion name is not recognized
 * gets weight 1; an explicit weight of 0 is honored. If every applied weight
 * is 0 the weighted mean falls back to the unweighted mean.
 */
function computeAggregateScores(
  scores: JudgeResponse['scores'],
  criteria: DirectScoreInput['criteria']
): { overallScore: number; weightedScore: number } {
  if (scores.length === 0) {
    return { overallScore: 0, weightedScore: 0 };
  }

  // First occurrence wins, matching a left-to-right lookup by name.
  const weightByName = new Map<string, number>();
  for (const criterion of criteria) {
    const key = normalizeName(criterion.name);
    if (!weightByName.has(key)) {
      weightByName.set(key, criterion.weight);
    }
  }

  let scoreSum = 0;
  let weightedSum = 0;
  let appliedWeight = 0;
  for (const entry of scores) {
    const weight = weightByName.get(normalizeName(entry.criterion)) ?? 1;
    scoreSum += entry.score;
    weightedSum += entry.score * weight;
    appliedWeight += weight;
  }

  const overallScore = scoreSum / scores.length;
  const weightedScore = appliedWeight > 0 ? weightedSum / appliedWeight : overallScore;
  return { overallScore, weightedScore };
}

/**
 * Scores an LLM response against each supplied criterion using a judge model.
 *
 * Builds a system and user prompt from the input, calls the configured OpenAI
 * model, parses and validates the JSON reply, and derives the unweighted and
 * weighted mean scores (rounded to two decimals).
 *
 * @param input - Response, original prompt, criteria and optional rubric/context.
 * @returns The evaluation result. This function does not throw for model,
 *   parsing or validation failures: it returns `success: false` with empty
 *   scores and the error message in `summary.assessment`.
 */
export async function executeDirectScore(input: DirectScoreInput): Promise<DirectScoreOutput> {
  const startTime = Date.now();
  const scale = input.rubric?.scale ?? '1-5';
  const maxScore = MAX_SCORE_BY_SCALE[scale];

  const buildMetadata = (): DirectScoreOutput['metadata'] => ({
    evaluationTimeMs: Date.now() - startTime,
    model: config.openai.model,
    criteriaCount: input.criteria.length
  });

  const systemPrompt = `You are an expert evaluator. Assess the response against each criterion.
For each criterion:
1. Find specific evidence in the response
2. Score according to the rubric (1-${maxScore} scale)
3. Justify your score
4. Suggest one improvement

Be objective and consistent. Base scores on explicit evidence.`;

  const userPrompt = `## Original Prompt
${input.prompt}

${input.context ? `## Context\n${input.context}\n` : ''}
## Response to Evaluate
${input.response}

## Criteria
${input.criteria.map((c, i) => `${i + 1}. **${c.name}** (weight: ${c.weight}): ${c.description}`).join('\n')}

${input.rubric?.levelDescriptions ? `## Rubric\n${Object.entries(input.rubric.levelDescriptions).map(([k, v]) => `- ${k}: ${v}`).join('\n')}` : ''}

Respond with valid JSON matching this structure:
{
  "scores": [
    {
      "criterion": "criterion name",
      "score": number,
      "evidence": ["quote or observation 1", "quote 2"],
      "justification": "why this score",
      "improvement": "specific suggestion"
    }
  ],
  "summary": {
    "assessment": "overall quality summary",
    "strengths": ["strength 1", "strength 2"],
    "weaknesses": ["weakness 1"],
    "priorities": ["most important improvement"]
  }
}`;

  try {
    const result = await generateText({
      model: openai(config.openai.model),
      system: systemPrompt,
      prompt: userPrompt,
      temperature: 0.3
    });

    // Validation failures throw and are reported through the failure result below.
    const judged = JudgeResponseSchema.parse(parseModelJson(result.text));
    const { overallScore, weightedScore } = computeAggregateScores(judged.scores, input.criteria);

    return {
      success: true,
      scores: judged.scores.map((entry) => ({
        criterion: entry.criterion,
        score: entry.score,
        maxScore,
        justification: entry.justification,
        evidence: entry.evidence,
        improvement: entry.improvement
      })),
      overallScore: roundTo2(overallScore),
      weightedScore: roundTo2(weightedScore),
      summary: judged.summary,
      metadata: buildMetadata()
    };
  } catch (error: unknown) {
    return {
      success: false,
      scores: [],
      overallScore: 0,
      weightedScore: 0,
      summary: {
        assessment: `Evaluation failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
        strengths: [],
        weaknesses: [],
        priorities: []
      },
      metadata: buildMetadata()
    };
  }
}

/** AI SDK tool wrapper exposing {@link executeDirectScore} to an agent. */
export const directScoreTool = tool({
  description: `Evaluate a response by scoring it against specific criteria.
Use for objective evaluations like accuracy, completeness, clarity.
Returns structured scores with justifications.`,
  parameters: DirectScoreInputSchema,
  execute: executeDirectScore
});