A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
skills/evaluation/references/metrics.md
# Evaluation Reference: Metrics and Implementation

This document provides implementation details for evaluation metrics and evaluation systems.

## Core Metric Definitions

### Factual Accuracy

Factual accuracy measures whether claims in agent output match ground truth.

```
Excellent (1.0): All claims verified against ground truth, no errors
Good (0.8): Minor errors that do not affect main conclusions
Acceptable (0.6): Major claims correct, minor inaccuracies present
Poor (0.3): Significant factual errors in key claims
Failed (0.0): Fundamental factual errors that invalidate output
```

Calculation approach:
- Extract claims from output
- Verify each claim against ground truth
- Weight claims by importance (major claims carry more weight)
- Calculate the weighted average of claim accuracy

### Completeness

Completeness measures whether output covers all requested aspects.

```
Excellent (1.0): All requested aspects thoroughly covered
Good (0.8): Most aspects covered with minor gaps
Acceptable (0.6): Key aspects covered, some gaps
Poor (0.3): Major aspects missing from output
Failed (0.0): Fundamental aspects not addressed
```

### Citation Accuracy

Citation accuracy measures whether cited sources match claimed sources.

```
Excellent (1.0): All citations accurate and complete
Good (0.8): Minor citation formatting issues
Acceptable (0.6): Major citations accurate
Poor (0.3): Significant citation problems
Failed (0.0): Citations missing or completely incorrect
```

### Source Quality

Source quality measures whether appropriate primary sources were used.

```
Excellent (1.0): Primary authoritative sources
Good (0.8): Mostly primary sources with some secondary
Acceptable (0.6): Mix of primary and secondary sources
Poor (0.3): Mostly secondary or unreliable sources
Failed (0.0): No credible sources cited
```

### Tool Efficiency

Tool efficiency measures whether the agent used appropriate tools a reasonable number of times.

```
Excellent (1.0): Optimal tool selection and call count
Good (0.8): Good tool selection with minor inefficiencies
Acceptable (0.6): Appropriate tools with some redundancy
Poor (0.3): Wrong tools or excessive call counts
Failed (0.0): Severe tool misuse or extremely excessive calls
```

## Rubric Implementation

```python
EVALUATION_DIMENSIONS = {
    "factual_accuracy": {
        "weight": 0.30,
        "description": "Claims match ground truth",
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0
        }
    },
    "completeness": {
        "weight": 0.25,
        "description": "All requested aspects covered",
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0
        }
    },
    "citation_accuracy": {
        "weight": 0.15,
        "description": "Citations match sources",
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0
        }
    },
    "source_quality": {
        "weight": 0.10,
        "description": "Appropriate primary sources used",
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0
        }
    },
    "tool_efficiency": {
        "weight": 0.20,
        "description": "Right tools used reasonably",
        "levels": {
            "excellent": 1.0,
            "good": 0.8,
            "acceptable": 0.6,
            "poor": 0.3,
            "failed": 0.0
        }
    }
}

def calculate_overall_score(dimension_scores, rubric):
    """Calculate weighted overall score from dimension scores."""
    total_weight = 0
    weighted_sum = 0

    for dimension, score in dimension_scores.items():
        if dimension in rubric:
            weight = rubric[dimension]["weight"]
            weighted_sum += score * weight
            total_weight += weight

    return weighted_sum / total_weight if total_weight > 0 else 0
```
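As a quick illustration of how the weights combine, here is a small usage sketch of `calculate_overall_score` with hypothetical dimension scores (the numbers are illustrative, not taken from any real evaluation run):

```python
# Hypothetical per-dimension scores for one test case (illustrative only).
example_scores = {
    "factual_accuracy": 1.0,
    "completeness": 0.8,
    "citation_accuracy": 0.6,
    "source_quality": 0.8,
    "tool_efficiency": 1.0,
}

# 1.0*0.30 + 0.8*0.25 + 0.6*0.15 + 0.8*0.10 + 1.0*0.20 = 0.87
overall = calculate_overall_score(example_scores, EVALUATION_DIMENSIONS)
print(round(overall, 2))  # 0.87
```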
## Test Set Management

```python
class TestSet:
    def __init__(self, name):
        self.name = name
        self.tests = []
        self.tags = {}

    def add_test(self, test_case):
        """Add test case to test set."""
        self.tests.append(test_case)

        # Index by tags
        for tag in test_case.get("tags", []):
            if tag not in self.tags:
                self.tags[tag] = []
            self.tags[tag].append(len(self.tests) - 1)

    def filter(self, **criteria):
        """Filter tests by criteria."""
        filtered = []
        for test in self.tests:
            match = True
            for key, value in criteria.items():
                if test.get(key) != value:
                    match = False
                    break
            if match:
                filtered.append(test)
        return filtered

    def get_complexity_distribution(self):
        """Get distribution of tests by complexity."""
        distribution = {}
        for test in self.tests:
            complexity = test.get("complexity", "medium")
            distribution[complexity] = distribution.get(complexity, 0) + 1
        return distribution
```

## Evaluation Runner

```python
class EvaluationRunner:
    def __init__(self, test_set, rubric, agent):
        self.test_set = test_set
        self.rubric = rubric
        self.agent = agent
        self.results = []

    def run_all(self, verbose=False):
        """Run evaluation on all tests."""
        self.results = []

        for i, test in enumerate(self.test_set.tests):
            if verbose:
                print(f"Running test {i+1}/{len(self.test_set.tests)}")

            result = self.run_test(test)
            self.results.append(result)

        return self.summarize()

    def run_test(self, test):
        """Run single evaluation test."""
        # Get agent output
        output = self.agent.run(test["input"])

        # Evaluate
        evaluation = self.evaluate_output(output, test)

        return {
            "test": test,
            "output": output,
            "evaluation": evaluation
        }

    def evaluate_output(self, output, test):
        """Evaluate agent output against test."""
        ground_truth = test.get("expected", {})

        dimension_scores = {}
        for dimension, config in self.rubric.items():
            score = self.evaluate_dimension(
                output, ground_truth, dimension, config
            )
            dimension_scores[dimension] = score

        overall = calculate_overall_score(dimension_scores, self.rubric)

        return {
            "overall_score": overall,
            "dimension_scores": dimension_scores,
            "passed": overall >= 0.7
        }

    def summarize(self):
        """Summarize evaluation results."""
        if not self.results:
            return {"error": "No results"}

        passed = sum(1 for r in self.results if r["evaluation"]["passed"])

        dimension_totals = {}
        for dimension in self.rubric.keys():
            dimension_totals[dimension] = {
                "total": 0,
                "count": 0
            }

        for result in self.results:
            for dimension, score in result["evaluation"]["dimension_scores"].items():
                if dimension in dimension_totals:
                    dimension_totals[dimension]["total"] += score
                    dimension_totals[dimension]["count"] += 1

        dimension_averages = {}
        for dimension, data in dimension_totals.items():
            if data["count"] > 0:
                dimension_averages[dimension] = data["total"] / data["count"]

        return {
            "total_tests": len(self.results),
            "passed": passed,
            "failed": len(self.results) - passed,
            "pass_rate": passed / len(self.results) if self.results else 0,
            "dimension_averages": dimension_averages,
            "failures": [
                r for r in self.results
                if not r["evaluation"]["passed"]
            ]
        }
```
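`EvaluationRunner` delegates per-dimension scoring to an `evaluate_dimension` method that this reference does not define. A minimal sketch of one way to supply it, assuming each test's `expected` dict maps a dimension name to a target level such as `"good"` (a production setup would more likely use an LLM judge or dimension-specific checks):

```python
class SimpleEvaluationRunner(EvaluationRunner):
    """Illustrative runner; the scoring rule below is an assumption,
    not part of the original reference."""

    def evaluate_dimension(self, output, ground_truth, dimension, config):
        # Look up the expected level for this dimension in the test's
        # ground truth and convert it to a numeric score via the rubric.
        expected_level = ground_truth.get(dimension, "acceptable")
        return config["levels"].get(expected_level, 0.0)
```

With that hook in place, a run looks like `SimpleEvaluationRunner(test_set, EVALUATION_DIMENSIONS, agent).run_all(verbose=True)`, which returns the summary dict produced by `summarize`.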
## Production Monitoring

```python
import random

class ProductionMonitor:
    def __init__(self, sample_rate=0.01):
        self.sample_rate = sample_rate
        self.samples = []
        self.alert_thresholds = {
            "pass_rate_warning": 0.85,
            "pass_rate_critical": 0.70
        }

    def sample_and_evaluate(self, query, output):
        """Sample production interaction for evaluation."""
        if random.random() > self.sample_rate:
            return None

        evaluation = evaluate_output(output, {}, EVALUATION_RUBRIC)

        sample = {
            "query": query[:200],
            "output_preview": output[:200],
            "score": evaluation["overall_score"],
            "passed": evaluation["passed"],
            "timestamp": current_timestamp()
        }

        self.samples.append(sample)
        return sample

    def get_metrics(self):
        """Calculate current metrics from samples."""
        if not self.samples:
            return {"status": "insufficient_data"}

        passed = sum(1 for s in self.samples if s["passed"])
        pass_rate = passed / len(self.samples)

        avg_score = sum(s["score"] for s in self.samples) / len(self.samples)

        return {
            "sample_count": len(self.samples),
            "pass_rate": pass_rate,
            "average_score": avg_score,
            "status": self._get_status(pass_rate)
        }

    def _get_status(self, pass_rate):
        """Get status based on pass rate."""
        if pass_rate < self.alert_thresholds["pass_rate_critical"]:
            return "critical"
        elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
            return "warning"
        else:
            return "healthy"
```
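`sample_and_evaluate` relies on a module-level `evaluate_output`, an `EVALUATION_RUBRIC`, and a `current_timestamp` helper that are not defined in this reference. A minimal wiring sketch that stubs those dependencies to show the monitoring flow end to end (the stub scorer is an assumption, not the real evaluator):

```python
import time

# Assumed stand-ins for the dependencies left undefined above.
EVALUATION_RUBRIC = EVALUATION_DIMENSIONS

def current_timestamp():
    return time.time()

def evaluate_output(output, ground_truth, rubric):
    # Placeholder scorer: fixed "good" scores per dimension. A real
    # deployment would call an LLM judge or heuristic checks here.
    dimension_scores = {dim: 0.8 for dim in rubric}
    overall = calculate_overall_score(dimension_scores, rubric)
    return {
        "overall_score": overall,
        "dimension_scores": dimension_scores,
        "passed": overall >= 0.7,
    }

monitor = ProductionMonitor(sample_rate=1.0)  # sample everything for the demo
monitor.sample_and_evaluate(
    "Summarize the Q3 report",
    "The Q3 report shows revenue growth driven by the new product line.",
)
print(monitor.get_metrics())  # {'sample_count': 1, 'pass_rate': 1.0, 'average_score': 0.8, 'status': 'healthy'}
```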