Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
skills/evaluation/scripts/evaluator.py
1"""Agent Evaluation Framework for context-engineered agent systems.23Use when: building evaluation pipelines, scoring agent outputs against4multi-dimensional rubrics, managing test sets, or monitoring production5agent quality. Provides composable classes that can be used independently6or wired together into a full evaluation pipeline.78Typical usage::910evaluator = AgentEvaluator()11test_set = TestSet("my_tests").create_standard_tests()12runner = EvaluationRunner(evaluator, test_set)13summary = runner.run_all(verbose=True)14print(summary)15"""1617from typing import Dict, List, Any, Optional18from dataclasses import dataclass19from enum import Enum20import time2122__all__ = [23"ScoreLevel",24"RubricDimension",25"DEFAULT_RUBRIC",26"AgentEvaluator",27"TestSet",28"EvaluationRunner",29"ProductionMonitor",30]313233class ScoreLevel(Enum):34"""Use when: mapping qualitative judgments to numeric scores."""3536EXCELLENT = 1.037GOOD = 0.838ACCEPTABLE = 0.639POOR = 0.340FAILED = 0.0414243@dataclass44class RubricDimension:45"""Definition of a single evaluation dimension.4647Use when: defining custom rubric dimensions beyond the defaults.48"""4950name: str51weight: float52description: str53levels: Dict[str, str] # level_name -> description545556DEFAULT_RUBRIC: Dict[str, RubricDimension] = {57"factual_accuracy": RubricDimension(58name="factual_accuracy",59weight=0.30,60description="Claims in output match ground truth",61levels={62"excellent": "All claims verified, no errors",63"good": "Minor errors not affecting main conclusions",64"acceptable": "Major claims correct, minor inaccuracies",65"poor": "Significant factual errors",66"failed": "Fundamental factual errors",67},68),69"completeness": RubricDimension(70name="completeness",71weight=0.25,72description="Output covers all requested aspects",73levels={74"excellent": "All aspects thoroughly covered",75"good": "Most aspects covered, minor gaps",76"acceptable": "Key aspects covered, some gaps",77"poor": "Major aspects missing",78"failed": "Fundamental aspects missing",79},80),81"citation_accuracy": RubricDimension(82name="citation_accuracy",83weight=0.15,84description="Citations match claimed sources",85levels={86"excellent": "All citations accurate and complete",87"good": "Minor citation issues",88"acceptable": "Major citations accurate",89"poor": "Significant citation problems",90"failed": "Citations missing or incorrect",91},92),93"source_quality": RubricDimension(94name="source_quality",95weight=0.10,96description="Uses appropriate primary sources",97levels={98"excellent": "Primary sources, authoritative",99"good": "Mostly primary, some secondary",100"acceptable": "Mix of primary and secondary",101"poor": "Mostly secondary or unreliable",102"failed": "No credible sources",103},104),105"tool_efficiency": RubricDimension(106name="tool_efficiency",107weight=0.20,108description="Uses right tools reasonable number of times",109levels={110"excellent": "Optimal tool selection and count",111"good": "Good tool selection, minor inefficiencies",112"acceptable": "Appropriate tools, some redundancy",113"poor": "Wrong tools or excessive calls",114"failed": "Severe tool misuse",115},116),117}118119120# ---------------------------------------------------------------------------121# Evaluation Engine122# ---------------------------------------------------------------------------123124125class AgentEvaluator:126"""Main evaluation engine for agent outputs.127128Use when: scoring a single agent output against a multi-dimensional rubric.129Instantiate with a custom rubric or rely on ``DEFAULT_RUBRIC``.130"""131132def __init__(self, rubric: Optional[Dict[str, RubricDimension]] = None) -> None:133self.rubric: Dict[str, RubricDimension] = rubric or DEFAULT_RUBRIC134self.evaluation_history: List[Dict[str, Any]] = []135136def evaluate(137self,138task: Dict[str, Any],139output: str,140ground_truth: Optional[Dict[str, Any]] = None,141tool_calls: Optional[List[Dict[str, Any]]] = None,142) -> Dict[str, Any]:143"""Evaluate agent output against task requirements.144145Use when: you have a single (task, output) pair and need per-dimension146scores plus an overall pass/fail verdict.147148Returns evaluation results with per-dimension scores.149"""150scores: Dict[str, Dict[str, Any]] = {}151152for dimension_name, dimension in self.rubric.items():153score = self._evaluate_dimension(154dimension=dimension,155task=task,156output=output,157ground_truth=ground_truth,158tool_calls=tool_calls,159)160161scores[dimension_name] = {162"score": score,163"weight": dimension.weight,164"level": self._score_to_level(score),165}166167# Calculate weighted overall168overall: float = sum(169s["score"] * self.rubric[k].weight for k, s in scores.items()170)171172result: Dict[str, Any] = {173"overall_score": overall,174"dimension_scores": scores,175"passed": overall >= 0.7,176"timestamp": time.time(),177}178179self.evaluation_history.append(result)180return result181182def _evaluate_dimension(183self,184dimension: RubricDimension,185task: Dict[str, Any],186output: str,187ground_truth: Optional[Dict[str, Any]] = None,188tool_calls: Optional[List[Dict[str, Any]]] = None,189) -> float:190"""Evaluate a single dimension.191192Use when: extending the evaluator with custom dimension logic.193In production, replace heuristics with LLM judgment or human evaluation.194"""195output_lower: str = output.lower()196task_type: str = task.get("type", "")197198if dimension.name == "factual_accuracy":199if ground_truth:200return self._check_factual_accuracy(output, ground_truth)201return 0.7 # Default assumption202203elif dimension.name == "completeness":204required: List[str] = task.get("requirements", [])205if required:206covered = sum(1 for r in required if r.lower() in output_lower)207return covered / len(required)208return 0.8209210elif dimension.name == "citation_accuracy":211if task.get("requires_citations"):212# Look for citation patterns like [1], [Author 2024], [source]213# Avoid false positives from code brackets or JSON214citation_pattern = r'\[\d+\]|\[[A-Z][a-z]+(?:\s+(?:et al\.?|&)\s+[A-Z][a-z]+)?\s*[\d,]+\]|\[(?:source|ref|cite)[^\]]*\]'215import re as _re216citations_found = _re.findall(citation_pattern, output)217if len(citations_found) >= 1:218return 1.0219elif any(marker in output_lower for marker in ["according to", "cited in", "reported by"]):220return 0.7221return 0.4222return 0.8 # Citations not required223224elif dimension.name == "source_quality":225quality_markers = ["according to", "reported by", "data from", "study"]226quality_count = sum(1 for m in quality_markers if m in output_lower)227return min(1.0, 0.5 + quality_count * 0.1)228229elif dimension.name == "tool_efficiency":230if tool_calls:231expected_count = self._estimate_expected_tools(task_type)232actual_count = len(tool_calls)233if actual_count <= expected_count:234return 1.0235elif actual_count <= expected_count * 1.5:236return 0.7237else:238return 0.4239return 0.8 # No tool calls needed or recorded240241return 0.5 # Default242243def _check_factual_accuracy(244self, output: str, ground_truth: Dict[str, Any]245) -> float:246"""Check output against ground truth.247248Use when: ground truth key_claims are available for comparison.249"""250if not ground_truth:251return 0.7252253key_claims: List[str] = ground_truth.get("key_claims", [])254if not key_claims:255return 0.7256257output_lower: str = output.lower()258matched: int = sum(1 for claim in key_claims if claim.lower() in output_lower)259260if matched == len(key_claims):261return 1.0262elif matched >= len(key_claims) * 0.7:263return 0.8264elif matched >= len(key_claims) * 0.5:265return 0.6266else:267return 0.3268269def _estimate_expected_tools(self, task_type: str) -> int:270"""Estimate expected tool count for task type."""271estimates: Dict[str, int] = {272"research": 3,273"create": 2,274"analyze": 2,275"general": 1,276}277return estimates.get(task_type, 1)278279def _score_to_level(self, score: float) -> str:280"""Convert numeric score to level name."""281if score >= 0.9:282return "excellent"283elif score >= 0.7:284return "good"285elif score >= 0.5:286return "acceptable"287elif score >= 0.25:288return "poor"289else:290return "failed"291292293# ---------------------------------------------------------------------------294# Test Set Management295# ---------------------------------------------------------------------------296297298class TestSet:299"""Manage evaluation test sets with tagging and complexity stratification.300301Use when: building, filtering, or analyzing collections of evaluation302test cases. Supports tag-based indexing and complexity distribution303analysis.304"""305306def __init__(self, name: str) -> None:307self.name: str = name308self.tests: List[Dict[str, Any]] = []309self.tags: Dict[str, List[int]] = {}310311def add_test(self, test: Dict[str, Any]) -> None:312"""Add a test case to the test set.313314Use when: incrementally building a test set from individual cases.315"""316self.tests.append(test)317idx: int = len(self.tests) - 1318319for tag in test.get("tags", []):320if tag not in self.tags:321self.tags[tag] = []322self.tags[tag].append(idx)323324def filter(self, **criteria: Any) -> List[Dict[str, Any]]:325"""Filter tests by criteria.326327Use when: selecting a subset of tests matching specific field values.328"""329results: List[Dict[str, Any]] = []330for test in self.tests:331match = True332for key, value in criteria.items():333if test.get(key) != value:334match = False335break336if match:337results.append(test)338return results339340def get_complexity_distribution(self) -> Dict[str, int]:341"""Get distribution of tests by complexity.342343Use when: verifying test set balance across difficulty levels.344"""345distribution: Dict[str, int] = {}346for test in self.tests:347complexity: str = test.get("complexity", "medium")348distribution[complexity] = distribution.get(complexity, 0) + 1349return distribution350351def create_standard_tests(self) -> "TestSet":352"""Populate with standard test cases for context engineering evaluation.353354Use when: bootstrapping a test set quickly for initial development.355"""356tests: List[Dict[str, Any]] = [357{358"name": "simple_lookup",359"input": "What is the capital of France?",360"expected": {"type": "fact", "answer": "Paris"},361"complexity": "simple",362"tags": ["knowledge", "simple"],363},364{365"name": "context_retrieval",366"input": "Based on the user preferences, recommend a restaurant",367"context": {368"user_preferences": {369"cuisine": "Italian",370"price_range": "moderate",371}372},373"complexity": "medium",374"tags": ["retrieval", "reasoning"],375},376{377"name": "multi_step_reasoning",378"input": "Analyze the sales data and create a summary report",379"complexity": "complex",380"tags": ["analysis", "multi-step"],381},382]383384for test in tests:385self.add_test(test)386387return self388389390# ---------------------------------------------------------------------------391# Evaluation Runner392# ---------------------------------------------------------------------------393394395class EvaluationRunner:396"""Run evaluations across an entire test set and produce summaries.397398Use when: executing a full evaluation pass over a test set, comparing399agent versions, or generating evaluation reports.400"""401402def __init__(self, evaluator: AgentEvaluator, test_set: TestSet) -> None:403self.evaluator: AgentEvaluator = evaluator404self.test_set: TestSet = test_set405self.results: List[Dict[str, Any]] = []406407def run_all(self, verbose: bool = False) -> Dict[str, Any]:408"""Run evaluation on all tests in the test set.409410Use when: performing a complete evaluation pass.411"""412self.results = []413414for i, test in enumerate(self.test_set.tests):415if verbose:416print(417f"Running test {i + 1}/{len(self.test_set.tests)}: {test['name']}"418)419420result = self.run_test(test)421self.results.append(result)422423return self.summarize()424425def run_test(self, test: Dict[str, Any]) -> Dict[str, Any]:426"""Run a single evaluation test.427428Use when: evaluating an individual test case outside of a full run.429In production, replace the simulated output with actual agent execution.430"""431# In production, run actual agent432# Here we simulate433output: str = f"Simulated output for: {test.get('input', '')}"434435evaluation: Dict[str, Any] = self.evaluator.evaluate(436task=test,437output=output,438ground_truth=test.get("expected"),439tool_calls=[],440)441442return {443"test": test,444"output": output,445"evaluation": evaluation,446"passed": evaluation["passed"],447}448449def summarize(self) -> Dict[str, Any]:450"""Summarize evaluation results with per-dimension averages.451452Use when: generating a report after a full evaluation run.453"""454if not self.results:455return {"error": "No results"}456457passed: int = sum(1 for r in self.results if r["passed"])458459# Dimension averages460dimension_totals: Dict[str, Dict[str, float]] = {}461for dim_name in self.evaluator.rubric.keys():462dimension_totals[dim_name] = {"total": 0.0, "count": 0.0}463464for result in self.results:465for dim_name, score in result["evaluation"]["dimension_scores"].items():466dimension_totals[dim_name]["total"] += score["score"]467dimension_totals[dim_name]["count"] += 1468469dimension_averages: Dict[str, float] = {}470for dim_name, data in dimension_totals.items():471if data["count"] > 0:472dimension_averages[dim_name] = data["total"] / data["count"]473474return {475"total_tests": len(self.results),476"passed": passed,477"failed": len(self.results) - passed,478"pass_rate": passed / len(self.results) if self.results else 0,479"dimension_averages": dimension_averages,480"failures": [481{482"test": r["test"]["name"],483"score": r["evaluation"]["overall_score"],484}485for r in self.results486if not r["passed"]487],488}489490491# ---------------------------------------------------------------------------492# Production Monitoring493# ---------------------------------------------------------------------------494495496class ProductionMonitor:497"""Monitor agent performance in production via sampling.498499Use when: setting up continuous quality monitoring for a deployed agent.500Samples interactions at a configurable rate and tracks pass rate, average501score, and alert status.502"""503504def __init__(self, sample_rate: float = 0.01) -> None:505import random506507self.sample_rate: float = sample_rate508self._rng: random.Random = random.Random()509self.samples: List[Dict[str, Any]] = []510self.alert_thresholds: Dict[str, float] = {511"pass_rate_warning": 0.85,512"pass_rate_critical": 0.70,513}514515def should_sample(self) -> bool:516"""Determine if current interaction should be sampled.517518Use when: deciding at request time whether to evaluate this interaction.519"""520return self._rng.random() < self.sample_rate521522def record_sample(523self, query: str, output: str, evaluation: Dict[str, Any]524) -> None:525"""Record a production sample for evaluation.526527Use when: storing evaluated production interactions for trend analysis.528"""529sample: Dict[str, Any] = {530"query": query[:200],531"output_preview": output[:200],532"score": evaluation.get("overall_score", 0),533"passed": evaluation.get("passed", False),534"timestamp": time.time(),535}536self.samples.append(sample)537538def get_metrics(self) -> Dict[str, Any]:539"""Calculate current metrics from collected samples.540541Use when: checking production health or generating monitoring reports.542"""543if not self.samples:544return {"status": "insufficient_data"}545546passed: int = sum(1 for s in self.samples if s["passed"])547pass_rate: float = passed / len(self.samples)548avg_score: float = sum(s["score"] for s in self.samples) / len(self.samples)549550status: str = "healthy"551if pass_rate < self.alert_thresholds["pass_rate_critical"]:552status = "critical"553elif pass_rate < self.alert_thresholds["pass_rate_warning"]:554status = "warning"555556return {557"sample_count": len(self.samples),558"pass_rate": pass_rate,559"average_score": avg_score,560"status": status,561"alerts": self._generate_alerts(pass_rate, avg_score),562}563564def _generate_alerts(565self, pass_rate: float, avg_score: float566) -> List[Dict[str, str]]:567"""Generate alerts based on metrics."""568alerts: List[Dict[str, str]] = []569570if pass_rate < self.alert_thresholds["pass_rate_critical"]:571alerts.append(572{573"type": "critical",574"message": f"Pass rate ({pass_rate:.2f}) below critical threshold",575}576)577elif pass_rate < self.alert_thresholds["pass_rate_warning"]:578alerts.append(579{580"type": "warning",581"message": f"Pass rate ({pass_rate:.2f}) below warning threshold",582}583)584585if avg_score < 0.6:586alerts.append(587{588"type": "quality",589"message": f"Average score ({avg_score:.2f}) indicates quality issues",590}591)592593return alerts594595596# ---------------------------------------------------------------------------597# CLI entry point598# ---------------------------------------------------------------------------599600if __name__ == "__main__":601print("=== Agent Evaluation Framework Demo ===\n")602603# 1. Create evaluator with default rubric604evaluator = AgentEvaluator()605print(f"Rubric dimensions: {list(evaluator.rubric.keys())}\n")606607# 2. Build a standard test set608test_set = TestSet("demo").create_standard_tests()609print(f"Test set: {test_set.name}")610print(f"Test count: {len(test_set.tests)}")611print(f"Complexity distribution: {test_set.get_complexity_distribution()}\n")612613# 3. Run evaluation614runner = EvaluationRunner(evaluator, test_set)615summary = runner.run_all(verbose=True)616617print(f"\n--- Summary ---")618print(f"Total: {summary['total_tests']}")619print(f"Passed: {summary['passed']}")620print(f"Failed: {summary['failed']}")621print(f"Pass rate: {summary['pass_rate']:.1%}")622print(f"Dimension averages: {summary['dimension_averages']}")623624if summary["failures"]:625print(f"\nFailures:")626for f in summary["failures"]:627print(f" - {f['test']}: {f['score']:.2f}")628