Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylanGitHub muratcankoylanSource repo Original GitHub link
Files
241
Skill
n/a
Size
2.6 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
skills/evaluation/scripts/evaluator.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code628 linesFree
skills/evaluation/scripts/evaluator.py
1"""Agent Evaluation Framework for context-engineered agent systems.
2 
3Use when: building evaluation pipelines, scoring agent outputs against
4multi-dimensional rubrics, managing test sets, or monitoring production
5agent quality. Provides composable classes that can be used independently
6or wired together into a full evaluation pipeline.
7 
8Typical usage::
9 
10    evaluator = AgentEvaluator()
11    test_set = TestSet("my_tests").create_standard_tests()
12    runner = EvaluationRunner(evaluator, test_set)
13    summary = runner.run_all(verbose=True)
14    print(summary)
15"""
16 
17from typing import Dict, List, Any, Optional
18from dataclasses import dataclass
19from enum import Enum
20import time
21 
22__all__ = [
23    "ScoreLevel",
24    "RubricDimension",
25    "DEFAULT_RUBRIC",
26    "AgentEvaluator",
27    "TestSet",
28    "EvaluationRunner",
29    "ProductionMonitor",
30]
31 
32 
33class ScoreLevel(Enum):
34    """Use when: mapping qualitative judgments to numeric scores."""
35 
36    EXCELLENT = 1.0
37    GOOD = 0.8
38    ACCEPTABLE = 0.6
39    POOR = 0.3
40    FAILED = 0.0
41 
42 
43@dataclass
44class RubricDimension:
45    """Definition of a single evaluation dimension.
46 
47    Use when: defining custom rubric dimensions beyond the defaults.
48    """
49 
50    name: str
51    weight: float
52    description: str
53    levels: Dict[str, str]  # level_name -> description
54 
55 
56DEFAULT_RUBRIC: Dict[str, RubricDimension] = {
57    "factual_accuracy": RubricDimension(
58        name="factual_accuracy",
59        weight=0.30,
60        description="Claims in output match ground truth",
61        levels={
62            "excellent": "All claims verified, no errors",
63            "good": "Minor errors not affecting main conclusions",
64            "acceptable": "Major claims correct, minor inaccuracies",
65            "poor": "Significant factual errors",
66            "failed": "Fundamental factual errors",
67        },
68    ),
69    "completeness": RubricDimension(
70        name="completeness",
71        weight=0.25,
72        description="Output covers all requested aspects",
73        levels={
74            "excellent": "All aspects thoroughly covered",
75            "good": "Most aspects covered, minor gaps",
76            "acceptable": "Key aspects covered, some gaps",
77            "poor": "Major aspects missing",
78            "failed": "Fundamental aspects missing",
79        },
80    ),
81    "citation_accuracy": RubricDimension(
82        name="citation_accuracy",
83        weight=0.15,
84        description="Citations match claimed sources",
85        levels={
86            "excellent": "All citations accurate and complete",
87            "good": "Minor citation issues",
88            "acceptable": "Major citations accurate",
89            "poor": "Significant citation problems",
90            "failed": "Citations missing or incorrect",
91        },
92    ),
93    "source_quality": RubricDimension(
94        name="source_quality",
95        weight=0.10,
96        description="Uses appropriate primary sources",
97        levels={
98            "excellent": "Primary sources, authoritative",
99            "good": "Mostly primary, some secondary",
100            "acceptable": "Mix of primary and secondary",
101            "poor": "Mostly secondary or unreliable",
102            "failed": "No credible sources",
103        },
104    ),
105    "tool_efficiency": RubricDimension(
106        name="tool_efficiency",
107        weight=0.20,
108        description="Uses right tools reasonable number of times",
109        levels={
110            "excellent": "Optimal tool selection and count",
111            "good": "Good tool selection, minor inefficiencies",
112            "acceptable": "Appropriate tools, some redundancy",
113            "poor": "Wrong tools or excessive calls",
114            "failed": "Severe tool misuse",
115        },
116    ),
117}
118 
119 
120# ---------------------------------------------------------------------------
121# Evaluation Engine
122# ---------------------------------------------------------------------------
123 
124 
125class AgentEvaluator:
126    """Main evaluation engine for agent outputs.
127 
128    Use when: scoring a single agent output against a multi-dimensional rubric.
129    Instantiate with a custom rubric or rely on ``DEFAULT_RUBRIC``.
130    """
131 
132    def __init__(self, rubric: Optional[Dict[str, RubricDimension]] = None) -> None:
133        self.rubric: Dict[str, RubricDimension] = rubric or DEFAULT_RUBRIC
134        self.evaluation_history: List[Dict[str, Any]] = []
135 
136    def evaluate(
137        self,
138        task: Dict[str, Any],
139        output: str,
140        ground_truth: Optional[Dict[str, Any]] = None,
141        tool_calls: Optional[List[Dict[str, Any]]] = None,
142    ) -> Dict[str, Any]:
143        """Evaluate agent output against task requirements.
144 
145        Use when: you have a single (task, output) pair and need per-dimension
146        scores plus an overall pass/fail verdict.
147 
148        Returns evaluation results with per-dimension scores.
149        """
150        scores: Dict[str, Dict[str, Any]] = {}
151 
152        for dimension_name, dimension in self.rubric.items():
153            score = self._evaluate_dimension(
154                dimension=dimension,
155                task=task,
156                output=output,
157                ground_truth=ground_truth,
158                tool_calls=tool_calls,
159            )
160 
161            scores[dimension_name] = {
162                "score": score,
163                "weight": dimension.weight,
164                "level": self._score_to_level(score),
165            }
166 
167        # Calculate weighted overall
168        overall: float = sum(
169            s["score"] * self.rubric[k].weight for k, s in scores.items()
170        )
171 
172        result: Dict[str, Any] = {
173            "overall_score": overall,
174            "dimension_scores": scores,
175            "passed": overall >= 0.7,
176            "timestamp": time.time(),
177        }
178 
179        self.evaluation_history.append(result)
180        return result
181 
182    def _evaluate_dimension(
183        self,
184        dimension: RubricDimension,
185        task: Dict[str, Any],
186        output: str,
187        ground_truth: Optional[Dict[str, Any]] = None,
188        tool_calls: Optional[List[Dict[str, Any]]] = None,
189    ) -> float:
190        """Evaluate a single dimension.
191 
192        Use when: extending the evaluator with custom dimension logic.
193        In production, replace heuristics with LLM judgment or human evaluation.
194        """
195        output_lower: str = output.lower()
196        task_type: str = task.get("type", "")
197 
198        if dimension.name == "factual_accuracy":
199            if ground_truth:
200                return self._check_factual_accuracy(output, ground_truth)
201            return 0.7  # Default assumption
202 
203        elif dimension.name == "completeness":
204            required: List[str] = task.get("requirements", [])
205            if required:
206                covered = sum(1 for r in required if r.lower() in output_lower)
207                return covered / len(required)
208            return 0.8
209 
210        elif dimension.name == "citation_accuracy":
211            if task.get("requires_citations"):
212                # Look for citation patterns like [1], [Author 2024], [source]
213                # Avoid false positives from code brackets or JSON
214                citation_pattern = r'\[\d+\]|\[[A-Z][a-z]+(?:\s+(?:et al\.?|&)\s+[A-Z][a-z]+)?\s*[\d,]+\]|\[(?:source|ref|cite)[^\]]*\]'
215                import re as _re
216                citations_found = _re.findall(citation_pattern, output)
217                if len(citations_found) >= 1:
218                    return 1.0
219                elif any(marker in output_lower for marker in ["according to", "cited in", "reported by"]):
220                    return 0.7
221                return 0.4
222            return 0.8  # Citations not required
223 
224        elif dimension.name == "source_quality":
225            quality_markers = ["according to", "reported by", "data from", "study"]
226            quality_count = sum(1 for m in quality_markers if m in output_lower)
227            return min(1.0, 0.5 + quality_count * 0.1)
228 
229        elif dimension.name == "tool_efficiency":
230            if tool_calls:
231                expected_count = self._estimate_expected_tools(task_type)
232                actual_count = len(tool_calls)
233                if actual_count <= expected_count:
234                    return 1.0
235                elif actual_count <= expected_count * 1.5:
236                    return 0.7
237                else:
238                    return 0.4
239            return 0.8  # No tool calls needed or recorded
240 
241        return 0.5  # Default
242 
243    def _check_factual_accuracy(
244        self, output: str, ground_truth: Dict[str, Any]
245    ) -> float:
246        """Check output against ground truth.
247 
248        Use when: ground truth key_claims are available for comparison.
249        """
250        if not ground_truth:
251            return 0.7
252 
253        key_claims: List[str] = ground_truth.get("key_claims", [])
254        if not key_claims:
255            return 0.7
256 
257        output_lower: str = output.lower()
258        matched: int = sum(1 for claim in key_claims if claim.lower() in output_lower)
259 
260        if matched == len(key_claims):
261            return 1.0
262        elif matched >= len(key_claims) * 0.7:
263            return 0.8
264        elif matched >= len(key_claims) * 0.5:
265            return 0.6
266        else:
267            return 0.3
268 
269    def _estimate_expected_tools(self, task_type: str) -> int:
270        """Estimate expected tool count for task type."""
271        estimates: Dict[str, int] = {
272            "research": 3,
273            "create": 2,
274            "analyze": 2,
275            "general": 1,
276        }
277        return estimates.get(task_type, 1)
278 
279    def _score_to_level(self, score: float) -> str:
280        """Convert numeric score to level name."""
281        if score >= 0.9:
282            return "excellent"
283        elif score >= 0.7:
284            return "good"
285        elif score >= 0.5:
286            return "acceptable"
287        elif score >= 0.25:
288            return "poor"
289        else:
290            return "failed"
291 
292 
293# ---------------------------------------------------------------------------
294# Test Set Management
295# ---------------------------------------------------------------------------
296 
297 
298class TestSet:
299    """Manage evaluation test sets with tagging and complexity stratification.
300 
301    Use when: building, filtering, or analyzing collections of evaluation
302    test cases. Supports tag-based indexing and complexity distribution
303    analysis.
304    """
305 
306    def __init__(self, name: str) -> None:
307        self.name: str = name
308        self.tests: List[Dict[str, Any]] = []
309        self.tags: Dict[str, List[int]] = {}
310 
311    def add_test(self, test: Dict[str, Any]) -> None:
312        """Add a test case to the test set.
313 
314        Use when: incrementally building a test set from individual cases.
315        """
316        self.tests.append(test)
317        idx: int = len(self.tests) - 1
318 
319        for tag in test.get("tags", []):
320            if tag not in self.tags:
321                self.tags[tag] = []
322            self.tags[tag].append(idx)
323 
324    def filter(self, **criteria: Any) -> List[Dict[str, Any]]:
325        """Filter tests by criteria.
326 
327        Use when: selecting a subset of tests matching specific field values.
328        """
329        results: List[Dict[str, Any]] = []
330        for test in self.tests:
331            match = True
332            for key, value in criteria.items():
333                if test.get(key) != value:
334                    match = False
335                    break
336            if match:
337                results.append(test)
338        return results
339 
340    def get_complexity_distribution(self) -> Dict[str, int]:
341        """Get distribution of tests by complexity.
342 
343        Use when: verifying test set balance across difficulty levels.
344        """
345        distribution: Dict[str, int] = {}
346        for test in self.tests:
347            complexity: str = test.get("complexity", "medium")
348            distribution[complexity] = distribution.get(complexity, 0) + 1
349        return distribution
350 
351    def create_standard_tests(self) -> "TestSet":
352        """Populate with standard test cases for context engineering evaluation.
353 
354        Use when: bootstrapping a test set quickly for initial development.
355        """
356        tests: List[Dict[str, Any]] = [
357            {
358                "name": "simple_lookup",
359                "input": "What is the capital of France?",
360                "expected": {"type": "fact", "answer": "Paris"},
361                "complexity": "simple",
362                "tags": ["knowledge", "simple"],
363            },
364            {
365                "name": "context_retrieval",
366                "input": "Based on the user preferences, recommend a restaurant",
367                "context": {
368                    "user_preferences": {
369                        "cuisine": "Italian",
370                        "price_range": "moderate",
371                    }
372                },
373                "complexity": "medium",
374                "tags": ["retrieval", "reasoning"],
375            },
376            {
377                "name": "multi_step_reasoning",
378                "input": "Analyze the sales data and create a summary report",
379                "complexity": "complex",
380                "tags": ["analysis", "multi-step"],
381            },
382        ]
383 
384        for test in tests:
385            self.add_test(test)
386 
387        return self
388 
389 
390# ---------------------------------------------------------------------------
391# Evaluation Runner
392# ---------------------------------------------------------------------------
393 
394 
395class EvaluationRunner:
396    """Run evaluations across an entire test set and produce summaries.
397 
398    Use when: executing a full evaluation pass over a test set, comparing
399    agent versions, or generating evaluation reports.
400    """
401 
402    def __init__(self, evaluator: AgentEvaluator, test_set: TestSet) -> None:
403        self.evaluator: AgentEvaluator = evaluator
404        self.test_set: TestSet = test_set
405        self.results: List[Dict[str, Any]] = []
406 
407    def run_all(self, verbose: bool = False) -> Dict[str, Any]:
408        """Run evaluation on all tests in the test set.
409 
410        Use when: performing a complete evaluation pass.
411        """
412        self.results = []
413 
414        for i, test in enumerate(self.test_set.tests):
415            if verbose:
416                print(
417                    f"Running test {i + 1}/{len(self.test_set.tests)}: {test['name']}"
418                )
419 
420            result = self.run_test(test)
421            self.results.append(result)
422 
423        return self.summarize()
424 
425    def run_test(self, test: Dict[str, Any]) -> Dict[str, Any]:
426        """Run a single evaluation test.
427 
428        Use when: evaluating an individual test case outside of a full run.
429        In production, replace the simulated output with actual agent execution.
430        """
431        # In production, run actual agent
432        # Here we simulate
433        output: str = f"Simulated output for: {test.get('input', '')}"
434 
435        evaluation: Dict[str, Any] = self.evaluator.evaluate(
436            task=test,
437            output=output,
438            ground_truth=test.get("expected"),
439            tool_calls=[],
440        )
441 
442        return {
443            "test": test,
444            "output": output,
445            "evaluation": evaluation,
446            "passed": evaluation["passed"],
447        }
448 
449    def summarize(self) -> Dict[str, Any]:
450        """Summarize evaluation results with per-dimension averages.
451 
452        Use when: generating a report after a full evaluation run.
453        """
454        if not self.results:
455            return {"error": "No results"}
456 
457        passed: int = sum(1 for r in self.results if r["passed"])
458 
459        # Dimension averages
460        dimension_totals: Dict[str, Dict[str, float]] = {}
461        for dim_name in self.evaluator.rubric.keys():
462            dimension_totals[dim_name] = {"total": 0.0, "count": 0.0}
463 
464        for result in self.results:
465            for dim_name, score in result["evaluation"]["dimension_scores"].items():
466                dimension_totals[dim_name]["total"] += score["score"]
467                dimension_totals[dim_name]["count"] += 1
468 
469        dimension_averages: Dict[str, float] = {}
470        for dim_name, data in dimension_totals.items():
471            if data["count"] > 0:
472                dimension_averages[dim_name] = data["total"] / data["count"]
473 
474        return {
475            "total_tests": len(self.results),
476            "passed": passed,
477            "failed": len(self.results) - passed,
478            "pass_rate": passed / len(self.results) if self.results else 0,
479            "dimension_averages": dimension_averages,
480            "failures": [
481                {
482                    "test": r["test"]["name"],
483                    "score": r["evaluation"]["overall_score"],
484                }
485                for r in self.results
486                if not r["passed"]
487            ],
488        }
489 
490 
491# ---------------------------------------------------------------------------
492# Production Monitoring
493# ---------------------------------------------------------------------------
494 
495 
496class ProductionMonitor:
497    """Monitor agent performance in production via sampling.
498 
499    Use when: setting up continuous quality monitoring for a deployed agent.
500    Samples interactions at a configurable rate and tracks pass rate, average
501    score, and alert status.
502    """
503 
504    def __init__(self, sample_rate: float = 0.01) -> None:
505        import random
506 
507        self.sample_rate: float = sample_rate
508        self._rng: random.Random = random.Random()
509        self.samples: List[Dict[str, Any]] = []
510        self.alert_thresholds: Dict[str, float] = {
511            "pass_rate_warning": 0.85,
512            "pass_rate_critical": 0.70,
513        }
514 
515    def should_sample(self) -> bool:
516        """Determine if current interaction should be sampled.
517 
518        Use when: deciding at request time whether to evaluate this interaction.
519        """
520        return self._rng.random() < self.sample_rate
521 
522    def record_sample(
523        self, query: str, output: str, evaluation: Dict[str, Any]
524    ) -> None:
525        """Record a production sample for evaluation.
526 
527        Use when: storing evaluated production interactions for trend analysis.
528        """
529        sample: Dict[str, Any] = {
530            "query": query[:200],
531            "output_preview": output[:200],
532            "score": evaluation.get("overall_score", 0),
533            "passed": evaluation.get("passed", False),
534            "timestamp": time.time(),
535        }
536        self.samples.append(sample)
537 
538    def get_metrics(self) -> Dict[str, Any]:
539        """Calculate current metrics from collected samples.
540 
541        Use when: checking production health or generating monitoring reports.
542        """
543        if not self.samples:
544            return {"status": "insufficient_data"}
545 
546        passed: int = sum(1 for s in self.samples if s["passed"])
547        pass_rate: float = passed / len(self.samples)
548        avg_score: float = sum(s["score"] for s in self.samples) / len(self.samples)
549 
550        status: str = "healthy"
551        if pass_rate < self.alert_thresholds["pass_rate_critical"]:
552            status = "critical"
553        elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
554            status = "warning"
555 
556        return {
557            "sample_count": len(self.samples),
558            "pass_rate": pass_rate,
559            "average_score": avg_score,
560            "status": status,
561            "alerts": self._generate_alerts(pass_rate, avg_score),
562        }
563 
564    def _generate_alerts(
565        self, pass_rate: float, avg_score: float
566    ) -> List[Dict[str, str]]:
567        """Generate alerts based on metrics."""
568        alerts: List[Dict[str, str]] = []
569 
570        if pass_rate < self.alert_thresholds["pass_rate_critical"]:
571            alerts.append(
572                {
573                    "type": "critical",
574                    "message": f"Pass rate ({pass_rate:.2f}) below critical threshold",
575                }
576            )
577        elif pass_rate < self.alert_thresholds["pass_rate_warning"]:
578            alerts.append(
579                {
580                    "type": "warning",
581                    "message": f"Pass rate ({pass_rate:.2f}) below warning threshold",
582                }
583            )
584 
585        if avg_score < 0.6:
586            alerts.append(
587                {
588                    "type": "quality",
589                    "message": f"Average score ({avg_score:.2f}) indicates quality issues",
590                }
591            )
592 
593        return alerts
594 
595 
596# ---------------------------------------------------------------------------
597# CLI entry point
598# ---------------------------------------------------------------------------
599 
600if __name__ == "__main__":
601    print("=== Agent Evaluation Framework Demo ===\n")
602 
603    # 1. Create evaluator with default rubric
604    evaluator = AgentEvaluator()
605    print(f"Rubric dimensions: {list(evaluator.rubric.keys())}\n")
606 
607    # 2. Build a standard test set
608    test_set = TestSet("demo").create_standard_tests()
609    print(f"Test set: {test_set.name}")
610    print(f"Test count: {len(test_set.tests)}")
611    print(f"Complexity distribution: {test_set.get_complexity_distribution()}\n")
612 
613    # 3. Run evaluation
614    runner = EvaluationRunner(evaluator, test_set)
615    summary = runner.run_all(verbose=True)
616 
617    print(f"\n--- Summary ---")
618    print(f"Total: {summary['total_tests']}")
619    print(f"Passed: {summary['passed']}")
620    print(f"Failed: {summary['failed']}")
621    print(f"Pass rate: {summary['pass_rate']:.1%}")
622    print(f"Dimension averages: {summary['dimension_averages']}")
623 
624    if summary["failures"]:
625        print(f"\nFailures:")
626        for f in summary["failures"]:
627            print(f"  - {f['test']}: {f['score']:.2f}")
628
Preparing the source view

Agent Skills for Context Engineering

skills/evaluation/scripts/evaluator.py