Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts
import { tool } from 'ai';
import { z } from 'zod';
import { openai } from '@ai-sdk/openai';
import { generateText } from 'ai';
import { config } from '../../config/index.js';

/**
 * Input accepted by the pairwise comparison tool.
 *
 * `allowTie` and `swapPositions` both default to `true` when omitted.
 */
export const PairwiseCompareInputSchema = z.object({
  responseA: z.string().describe('First response to compare'),
  responseB: z.string().describe('Second response to compare'),
  prompt: z.string().describe('The original prompt both responses address'),
  context: z.string().optional().describe('Additional context'),
  criteria: z.array(z.string()).min(1).describe('Comparison criteria'),
  allowTie: z.boolean().optional().default(true).describe('Allow tie verdict'),
  swapPositions: z.boolean().optional().default(true).describe('Swap positions to reduce bias')
});

export type PairwiseCompareInput = z.infer<typeof PairwiseCompareInputSchema>;

/**
 * Result returned by {@link executePairwiseCompare}.
 *
 * `positionConsistency` is only populated when the comparison was run in both
 * orders (`swapPositions: true`).
 */
export const PairwiseCompareOutputSchema = z.object({
  success: z.boolean(),
  winner: z.enum(['A', 'B', 'TIE']),
  confidence: z.number().min(0).max(1),
  comparison: z.array(z.object({
    criterion: z.string(),
    winner: z.enum(['A', 'B', 'TIE']),
    aAssessment: z.string(),
    bAssessment: z.string(),
    reasoning: z.string()
  })),
  analysis: z.object({
    responseA: z.object({
      strengths: z.array(z.string()),
      weaknesses: z.array(z.string())
    }),
    responseB: z.object({
      strengths: z.array(z.string()),
      weaknesses: z.array(z.string())
    })
  }),
  differentiators: z.array(z.string()),
  positionConsistency: z.object({
    consistent: z.boolean(),
    firstPassWinner: z.enum(['A', 'B', 'TIE']).optional(),
    secondPassWinner: z.enum(['A', 'B', 'TIE']).optional()
  }).optional(),
  metadata: z.object({
    evaluationTimeMs: z.number(),
    model: z.string(),
    positionsSwapped: z.boolean()
  })
});

export type PairwiseCompareOutput = z.infer<typeof PairwiseCompareOutputSchema>;

/** Verdict label used for both the overall and the per-criterion winner. */
type Verdict = PairwiseCompareOutput['winner'];

/** Outcome of a single judge call, expressed in that call's own A/B labelling. */
interface PairEvaluation {
  winner: Verdict;
  confidence: number;
  comparison: PairwiseCompareOutput['comparison'];
  analysis: PairwiseCompareOutput['analysis'];
}

/**
 * Shape the judge model is asked to reply with. Reuses the field schemas of the
 * public output so a validated reply is guaranteed to fit the output type.
 * Confidence is accepted as any number here and clamped by the caller.
 */
const JudgeReplySchema = z.object({
  analysis: PairwiseCompareOutputSchema.shape.analysis,
  comparison: PairwiseCompareOutputSchema.shape.comparison,
  result: z.object({
    winner: z.enum(['A', 'B', 'TIE']),
    confidence: z.number()
  })
});

/**
 * Parse the JSON object contained in a model reply.
 *
 * Tries the trimmed text as-is first; if that fails (e.g. the reply is wrapped
 * in a markdown code fence or surrounded by prose) it falls back to the span
 * from the first `{` to the last `}`.
 *
 * @throws Error when no `{ ... }` span exists; SyntaxError when the span is not valid JSON.
 */
function parseJudgeJson(text: string): unknown {
  const trimmed = text.trim();
  try {
    return JSON.parse(trimmed);
  } catch {
    // Not bare JSON - fall through to brace extraction below.
  }

  const start = trimmed.indexOf('{');
  const end = trimmed.lastIndexOf('}');
  if (start === -1 || end <= start) {
    throw new Error('Judge model reply did not contain a JSON object');
  }
  return JSON.parse(trimmed.slice(start, end + 1));
}

/** Translate a verdict between the swapped and the original A/B labelling. */
function swapVerdict(verdict: Verdict): Verdict {
  if (verdict === 'A') return 'B';
  if (verdict === 'B') return 'A';
  return 'TIE';
}

/** One summary line per criterion that has a non-tie winner. */
function describeDifferentiators(comparison: PairwiseCompareOutput['comparison']): string[] {
  return comparison
    .filter(c => c.winner !== 'TIE')
    .map(c => `${c.criterion}: ${c.winner === 'A' ? 'Response A' : 'Response B'} wins - ${c.reasoning}`);
}

/**
 * Ask the judge model to compare two responses in the given presentation order.
 *
 * `first` is shown to the model as "Response A" and `second` as "Response B";
 * the returned verdicts use that labelling, so callers that swap the order
 * must map the labels back themselves.
 *
 * @throws When the model call fails, or its reply is not JSON matching the requested shape.
 */
async function evaluatePair(
  first: string,
  second: string,
  prompt: string,
  criteria: string[],
  context?: string,
  allowTie: boolean = true
): Promise<PairEvaluation> {
  const systemPrompt = `You are an expert evaluator comparing two AI responses.

IMPORTANT:
- Do NOT prefer responses because they are longer
- Do NOT prefer responses based on position (first vs second)
- Focus only on quality according to the criteria
- ${allowTie ? 'Ties are acceptable when responses are genuinely equivalent' : 'You must choose a winner'}`;

  const userPrompt = `## Original Prompt
${prompt}

${context ? `## Context\n${context}\n` : ''}
## Response A
${first}

## Response B
${second}

## Criteria to Compare
${criteria.map((c, i) => `${i + 1}. ${c}`).join('\n')}

First analyze each response independently, then compare them.
Respond with valid JSON:
{
"analysis": {
"responseA": { "strengths": [...], "weaknesses": [...] },
"responseB": { "strengths": [...], "weaknesses": [...] }
},
"comparison": [
{
"criterion": "criterion name",
"winner": "A" | "B" | "TIE",
"aAssessment": "brief assessment of A",
"bAssessment": "brief assessment of B",
"reasoning": "why this winner"
}
],
"result": {
"winner": "A" | "B" | "TIE",
"confidence": 0.0-1.0,
"reasoning": "overall reasoning"
}
}`;

  const result = await generateText({
    model: openai(config.openai.model),
    system: systemPrompt,
    prompt: userPrompt,
    temperature: 0.3
  });

  // Validate instead of trusting the model: a malformed reply fails here with a
  // descriptive error rather than leaking undefined fields into the output.
  const reply = JudgeReplySchema.parse(parseJudgeJson(result.text));

  return {
    winner: reply.result.winner,
    // The output schema promises [0, 1]; the model is not guaranteed to honour it.
    confidence: Math.min(1, Math.max(0, reply.result.confidence)),
    comparison: reply.comparison,
    analysis: reply.analysis
  };
}

/**
 * Compare two responses with an LLM judge.
 *
 * With `swapPositions` the pair is judged in both orders. The overall winner is
 * kept only when both passes agree (confidence is their mean, rounded to two
 * decimals); otherwise the result is `TIE` with confidence 0.5. Per-criterion
 * winners follow the same agree-or-tie rule. Without `swapPositions` a single
 * pass is returned as-is.
 *
 * Never throws: any failure is logged and reported as `success: false` with a
 * `TIE` verdict, zero confidence and empty details.
 *
 * @param input Parsed tool input (defaults already applied by the schema).
 * @returns The comparison result, including timing and model metadata.
 */
export async function executePairwiseCompare(input: PairwiseCompareInput): Promise<PairwiseCompareOutput> {
  const startTime = Date.now();

  try {
    if (!input.swapPositions) {
      // Single pass without swap
      const single = await evaluatePair(
        input.responseA,
        input.responseB,
        input.prompt,
        input.criteria,
        input.context,
        input.allowTie
      );

      return {
        success: true,
        winner: single.winner,
        confidence: single.confidence,
        comparison: single.comparison,
        analysis: single.analysis,
        differentiators: describeDifferentiators(single.comparison),
        metadata: {
          evaluationTimeMs: Date.now() - startTime,
          model: config.openai.model,
          positionsSwapped: false
        }
      };
    }

    // The two passes are independent, so run them concurrently.
    // pass1: A shown first; pass2: B shown first.
    const [pass1, pass2] = await Promise.all([
      evaluatePair(
        input.responseA,
        input.responseB,
        input.prompt,
        input.criteria,
        input.context,
        input.allowTie
      ),
      evaluatePair(
        input.responseB,
        input.responseA,
        input.prompt,
        input.criteria,
        input.context,
        input.allowTie
      )
    ]);

    // pass2 labels are swapped relative to the caller's A/B; map them back.
    const pass2WinnerMapped = swapVerdict(pass2.winner);
    const consistent = pass1.winner === pass2WinnerMapped;

    // NOTE(review): a disagreement yields TIE even when input.allowTie is false;
    // confirm whether callers need a forced winner in that case.
    const finalWinner: Verdict = consistent ? pass1.winner : 'TIE';
    const finalConfidence = consistent ? (pass1.confidence + pass2.confidence) / 2 : 0.5;

    // Pair up criteria by name (falling back to position) so a reordered or
    // shorter second-pass list neither crashes nor compares unrelated criteria.
    const pass2ByCriterion = new Map(pass2.comparison.map(c => [c.criterion, c] as const));
    const mergedComparison = pass1.comparison.map((c, i) => {
      const counterpart = pass2ByCriterion.get(c.criterion) ?? pass2.comparison[i];
      const confirmed = counterpart !== undefined && swapVerdict(counterpart.winner) === c.winner;
      return {
        ...c,
        winner: confirmed ? c.winner : ('TIE' as const)
      };
    });

    return {
      success: true,
      winner: finalWinner,
      confidence: Math.round(finalConfidence * 100) / 100,
      comparison: mergedComparison,
      analysis: pass1.analysis,
      differentiators: describeDifferentiators(mergedComparison),
      positionConsistency: {
        consistent,
        firstPassWinner: pass1.winner,
        secondPassWinner: pass2WinnerMapped
      },
      metadata: {
        evaluationTimeMs: Date.now() - startTime,
        model: config.openai.model,
        positionsSwapped: true
      }
    };
  } catch (error: unknown) {
    // Keep the non-throwing contract, but do not hide why the evaluation failed.
    console.error('pairwise-compare: evaluation failed', error);

    return {
      success: false,
      winner: 'TIE',
      confidence: 0,
      comparison: [],
      analysis: {
        responseA: { strengths: [], weaknesses: [] },
        responseB: { strengths: [], weaknesses: [] }
      },
      differentiators: [],
      metadata: {
        evaluationTimeMs: Date.now() - startTime,
        model: config.openai.model,
        positionsSwapped: input.swapPositions
      }
    };
  }
}

/** AI SDK tool wrapper exposing {@link executePairwiseCompare} to agents. */
export const pairwiseCompareTool = tool({
  description: `Compare two responses and select the better one.
Use for subjective evaluations like tone, persuasiveness, style.
More reliable than direct scoring for preferences.`,
  parameters: PairwiseCompareInputSchema,
  execute: executePairwiseCompare
});