Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
skills/advanced-evaluation/scripts/evaluation_example.py
"""Advanced Evaluation Example

Use when: building LLM-as-judge evaluation pipelines, comparing model outputs
with position-bias mitigation, or generating domain-specific scoring rubrics.

This module demonstrates the three core evaluation patterns from the
advanced-evaluation skill: direct scoring, pairwise comparison with position
swapping, and rubric generation. All functions use pseudocode-style examples
that work across Python environments without specific dependencies.
"""

from __future__ import annotations

from typing import Any

__all__ = [
    "direct_scoring_example",
    "pairwise_comparison_example",
    "rubric_generation_example",
]


# =============================================================================
# DIRECT SCORING EXAMPLE
# =============================================================================


def direct_scoring_example() -> dict[str, Any]:
    """Rate a single response against defined criteria using direct scoring.

    Use when: evaluating objective criteria like factual accuracy, instruction
    following, or toxicity where a clear ground truth or rubric exists.

    Returns:
        Dictionary containing per-criterion scores, evidence, justifications,
        and a weighted summary.
    """

    # Input
    prompt: str = "Explain quantum entanglement to a high school student"
    response: str = (
        "Quantum entanglement is like having two magical coins that are connected. "
        "When you flip one and it lands on heads, the other instantly shows tails, "
        'no matter how far apart they are. Scientists call this "spooky action at a distance."'
    )

    # Per-criterion weights sum to 1.0 so the weighted score stays on the 1-5 scale.
    criteria: list[dict[str, Any]] = [
        {"name": "Accuracy", "description": "Scientific correctness", "weight": 0.4},
        {"name": "Clarity", "description": "Understandable for audience", "weight": 0.3},
        {"name": "Engagement", "description": "Interesting and memorable", "weight": 0.3},
    ]

    # System prompt for the evaluator
    system_prompt: str = (
        "You are an expert evaluator. Assess the response against each criterion.\n\n"
        "For each criterion:\n"
        "1. Find specific evidence in the response\n"
        "2. Score according to the rubric (1-5 scale)\n"
        "3. Justify your score with evidence\n"
        "4. Suggest one specific improvement\n\n"
        "Be objective and consistent. Base scores on explicit evidence."
    )

    # User prompt structure (illustrative; not sent anywhere in this example)
    user_prompt: str = f"""## Original Prompt
{prompt}

## Response to Evaluate
{response}

## Criteria
1. **Accuracy** (weight: 0.4): Scientific correctness
2. **Clarity** (weight: 0.3): Understandable for audience
3. **Engagement** (weight: 0.3): Interesting and memorable

## Output Format
Respond with valid JSON:
{{
"scores": [
{{
"criterion": "Accuracy",
"score": 4,
"evidence": ["quote or observation"],
"justification": "why this score",
"improvement": "specific suggestion"
}}
],
"summary": {{
"assessment": "overall quality summary",
"strengths": ["strength 1"],
"weaknesses": ["weakness 1"]
}}
}}"""

    # Expected output structure (what a well-behaved judge model would return)
    expected_output: dict[str, Any] = {
        "scores": [
            {
                "criterion": "Accuracy",
                "score": 4,
                "evidence": ["Correctly uses analogy", "Mentions spooky action at a distance"],
                "justification": "Core concept is correct, analogy is appropriate",
                "improvement": "Could mention it's a quantum mechanical phenomenon",
            },
            {
                "criterion": "Clarity",
                "score": 5,
                "evidence": ["Simple coin analogy", "No jargon"],
                "justification": "Appropriate for high school level",
                "improvement": "None needed",
            },
            {
                "criterion": "Engagement",
                "score": 4,
                "evidence": ["Magical coins", "Spooky action quote"],
                "justification": "Memorable imagery and Einstein quote",
                "improvement": "Could add a real-world application",
            },
        ],
        "summary": {
            "assessment": "Good explanation suitable for the target audience",
            "strengths": ["Clear analogy", "Age-appropriate language"],
            "weaknesses": ["Could be more comprehensive"],
        },
    }

    # Calculate weighted score: each criterion score is multiplied by its
    # weight (looked up by criterion name), then normalized by total weight.
    total_weight: float = sum(c["weight"] for c in criteria)
    weighted_score: float = sum(
        s["score"] * next(c["weight"] for c in criteria if c["name"] == s["criterion"])
        for s in expected_output["scores"]
    ) / total_weight

    print(f"Weighted Score: {weighted_score:.2f}/5")
    return expected_output


# =============================================================================
# PAIRWISE COMPARISON WITH POSITION BIAS MITIGATION
# =============================================================================


def pairwise_comparison_example() -> dict[str, Any]:
    """Compare two responses with position-swapped bias mitigation.

    Use when: evaluating subjective preferences like tone, style, or
    persuasiveness where pairwise comparison achieves higher human-judge
    agreement than direct scoring.

    Returns:
        Dictionary containing the winner, confidence score, and whether
        position consistency was achieved across both passes.
    """

    prompt: str = "Explain machine learning to a beginner"

    response_a: str = (
        "Machine learning is a subset of artificial intelligence that enables "
        "systems to learn and improve from experience without being explicitly "
        "programmed. It uses statistical techniques to give computers the ability "
        "to identify patterns in data."
    )

    response_b: str = (
        "Imagine teaching a dog a new trick. You show the dog what to do, give "
        "treats when it's right, and eventually it learns. Machine learning works "
        "similarly - we show computers lots of examples, tell them when they're "
        "right, and they learn to recognize patterns on their own."
    )

    criteria: list[str] = ["clarity", "accessibility", "accuracy"]

    # System prompt emphasizing bias awareness
    system_prompt: str = (
        "You are an expert evaluator comparing two AI responses.\n\n"
        "CRITICAL INSTRUCTIONS:\n"
        "- Do NOT prefer responses because they are longer\n"
        "- Do NOT prefer responses based on position (first vs second)\n"
        "- Focus ONLY on quality according to the specified criteria\n"
        "- Ties are acceptable when responses are genuinely equivalent"
    )

    # Build evaluation prompt for a given ordering
    def evaluate_pass(
        first_response: str,
        second_response: str,
        first_label: str,
        second_label: str,
    ) -> str:
        """Build evaluation prompt for one pass of position-swapped comparison.

        Use when: constructing the prompt for a single evaluation pass before
        swapping response positions for bias mitigation.
        """
        return f"""## Original Prompt
{prompt}

## Response {first_label}
{first_response}

## Response {second_label}
{second_response}

## Comparison Criteria
{', '.join(criteria)}

## Output Format
{{
"comparison": [
{{"criterion": "clarity", "winner": "A|B|TIE", "reasoning": "..."}}
],
"result": {{
"winner": "A|B|TIE",
"confidence": 0.0-1.0,
"reasoning": "overall reasoning"
}}
}}"""

    # Position bias mitigation protocol: evaluate twice with positions swapped.
    # (Results here are hard-coded stand-ins for actual judge-model calls.)
    print("Pass 1: A in first position")
    pass1_result: dict[str, Any] = {"winner": "B", "confidence": 0.8}

    print("Pass 2: B in first position (swapped)")
    pass2_result: dict[str, Any] = {"winner": "A", "confidence": 0.75}  # A because B was first

    # Map pass2 result back (swap labels)
    def map_winner(winner: str) -> str:
        """Map winner label after position swap."""
        return {"A": "B", "B": "A", "TIE": "TIE"}[winner]

    pass2_mapped: str = map_winner(pass2_result["winner"])
    print(f"Pass 2 mapped winner: {pass2_mapped}")

    # Check consistency: the same winner in both orderings means no position bias.
    consistent: bool = pass1_result["winner"] == pass2_mapped

    final_result: dict[str, Any]
    if consistent:
        final_result = {
            "winner": pass1_result["winner"],
            "confidence": (pass1_result["confidence"] + pass2_result["confidence"]) / 2,
            "position_consistent": True,
        }
    else:
        # Inconsistent verdicts across orderings indicate position bias;
        # fall back to a low-confidence tie rather than trusting either pass.
        final_result = {
            "winner": "TIE",
            "confidence": 0.5,
            "position_consistent": False,
            "bias_detected": True,
        }

    print(f"\nFinal Result: {final_result}")
    return final_result


# =============================================================================
# RUBRIC GENERATION
# =============================================================================


def rubric_generation_example() -> dict[str, Any]:
    """Generate a domain-specific scoring rubric for consistent evaluation.

    Use when: establishing evaluation standards for a new criterion, reducing
    scoring variance (rubrics cut variance by 40-60%), or onboarding new
    evaluators to an existing evaluation pipeline.

    Returns:
        Dictionary containing score levels, characteristics, examples,
        scoring guidelines, and edge case handling.
    """

    criterion_name: str = "Code Readability"
    criterion_description: str = "How easy the code is to understand and maintain"
    domain: str = "software engineering"
    scale: str = "1-5"
    strictness: str = "balanced"

    system_prompt: str = (
        f"You are an expert in creating evaluation rubrics.\n"
        f"Create clear, actionable rubrics with distinct boundaries between levels.\n\n"
        f"Strictness: {strictness}\n"
        f"- lenient: Lower bar for passing scores\n"
        f"- balanced: Fair, typical expectations\n"
        f"- strict: High standards, critical evaluation"
    )

    user_prompt: str = f"""Create a scoring rubric for:

**Criterion**: {criterion_name}
**Description**: {criterion_description}
**Scale**: {scale}
**Domain**: {domain}

Generate:
1. Clear descriptions for each score level
2. Specific characteristics that define each level
3. Brief example text for each level
4. General scoring guidelines
5. Edge cases with guidance"""

    # Expected rubric structure (anchor levels 1/3/5 shown; 2/4 interpolate)
    rubric: dict[str, Any] = {
        "criterion": criterion_name,
        "scale": {"min": 1, "max": 5},
        "levels": [
            {
                "score": 1,
                "label": "Poor",
                "description": "Code is difficult to understand without significant effort",
                "characteristics": [
                    "No meaningful variable or function names",
                    "No comments or documentation",
                    "Deeply nested or convoluted logic",
                ],
                "example": "def f(x): return x[0]*x[1]+x[2]",
            },
            {
                "score": 3,
                "label": "Adequate",
                "description": "Code is understandable with some effort",
                "characteristics": [
                    "Most variables have meaningful names",
                    "Basic comments for complex sections",
                    "Logic is followable but could be cleaner",
                ],
                "example": (
                    "def calc_total(items): # calculate sum\n"
                    "    total = 0\n"
                    "    for i in items: total += i\n"
                    "    return total"
                ),
            },
            {
                "score": 5,
                "label": "Excellent",
                "description": "Code is immediately clear and maintainable",
                "characteristics": [
                    "All names are descriptive and consistent",
                    "Comprehensive documentation",
                    "Clean, modular structure",
                ],
                "example": (
                    "def calculate_total_price(items: List[Item]) -> Decimal:\n"
                    "    '''Calculate the total price of all items.'''\n"
                    "    return sum(item.price for item in items)"
                ),
            },
        ],
        "scoring_guidelines": [
            "Focus on readability, not cleverness",
            "Consider the intended audience (team skill level)",
            "Consistency matters more than style preference",
        ],
        "edge_cases": [
            {
                "situation": "Code uses domain-specific abbreviations",
                "guidance": "Score based on readability for domain experts, not general audience",
            },
            {
                "situation": "Code is auto-generated",
                "guidance": "Apply same standards but note in evaluation",
            },
        ],
    }

    print("Generated Rubric:")
    for level in rubric["levels"]:
        print(f"  {level['score']}: {level['label']} - {level['description']}")

    return rubric


# =============================================================================
# MAIN
# =============================================================================

if __name__ == "__main__":
    print("=" * 60)
    print("DIRECT SCORING EXAMPLE")
    print("=" * 60)
    direct_scoring_example()

    print("\n" + "=" * 60)
    print("PAIRWISE COMPARISON EXAMPLE")
    print("=" * 60)
    pairwise_comparison_example()

    print("\n" + "=" * 60)
    print("RUBRIC GENERATION EXAMPLE")
    print("=" * 60)
    rubric_generation_example()