Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
skills/advanced-evaluation/scripts/evaluation_example.py
"""Advanced Evaluation Example

Use when: building LLM-as-judge evaluation pipelines, comparing model outputs
with position-bias mitigation, or generating domain-specific scoring rubrics.

This module demonstrates the three core evaluation patterns from the
advanced-evaluation skill: direct scoring, pairwise comparison with position
swapping, and rubric generation. All functions use pseudocode-style examples
that work across Python environments without specific dependencies.
"""

from __future__ import annotations

from typing import Any

__all__ = [
    "direct_scoring_example",
    "pairwise_comparison_example",
    "rubric_generation_example",
]


# =============================================================================
# DIRECT SCORING EXAMPLE
# =============================================================================


def direct_scoring_example() -> dict[str, Any]:
    """Rate a single response against defined criteria using direct scoring.

    Use when: evaluating objective criteria like factual accuracy, instruction
    following, or toxicity where a clear ground truth or rubric exists.

    Returns:
        Dictionary containing per-criterion scores, evidence, justifications,
        and a weighted summary.
    """

    # Input
    prompt: str = "Explain quantum entanglement to a high school student"
    response: str = (
        "Quantum entanglement is like having two magical coins that are connected. "
        "When you flip one and it lands on heads, the other instantly shows tails, "
        'no matter how far apart they are. Scientists call this "spooky action at a distance."'
    )

    # Per-criterion weights sum to 1.0 so the weighted score stays on the 1-5 scale.
    criteria: list[dict[str, Any]] = [
        {"name": "Accuracy", "description": "Scientific correctness", "weight": 0.4},
        {"name": "Clarity", "description": "Understandable for audience", "weight": 0.3},
        {"name": "Engagement", "description": "Interesting and memorable", "weight": 0.3},
    ]

    # System prompt for the evaluator
    system_prompt: str = (
        "You are an expert evaluator. Assess the response against each criterion.\n\n"
        "For each criterion:\n"
        "1. Find specific evidence in the response\n"
        "2. Score according to the rubric (1-5 scale)\n"
        "3. Justify your score with evidence\n"
        "4. Suggest one specific improvement\n\n"
        "Be objective and consistent. Base scores on explicit evidence."
    )

    # User prompt structure (illustrative; not sent anywhere in this example)
    user_prompt: str = f"""## Original Prompt
{prompt}

## Response to Evaluate
{response}

## Criteria
1. **Accuracy** (weight: 0.4): Scientific correctness
2. **Clarity** (weight: 0.3): Understandable for audience
3. **Engagement** (weight: 0.3): Interesting and memorable

## Output Format
Respond with valid JSON:
{{
"scores": [
{{
"criterion": "Accuracy",
"score": 4,
"evidence": ["quote or observation"],
"justification": "why this score",
"improvement": "specific suggestion"
}}
],
"summary": {{
"assessment": "overall quality summary",
"strengths": ["strength 1"],
"weaknesses": ["weakness 1"]
}}
}}"""

    # Expected output structure (what a well-behaved judge model would return)
    expected_output: dict[str, Any] = {
        "scores": [
            {
                "criterion": "Accuracy",
                "score": 4,
                "evidence": ["Correctly uses analogy", "Mentions spooky action at a distance"],
                "justification": "Core concept is correct, analogy is appropriate",
                "improvement": "Could mention it's a quantum mechanical phenomenon",
            },
            {
                "criterion": "Clarity",
                "score": 5,
                "evidence": ["Simple coin analogy", "No jargon"],
                "justification": "Appropriate for high school level",
                "improvement": "None needed",
            },
            {
                "criterion": "Engagement",
                "score": 4,
                "evidence": ["Magical coins", "Spooky action quote"],
                "justification": "Memorable imagery and Einstein quote",
                "improvement": "Could add a real-world application",
            },
        ],
        "summary": {
            "assessment": "Good explanation suitable for the target audience",
            "strengths": ["Clear analogy", "Age-appropriate language"],
            "weaknesses": ["Could be more comprehensive"],
        },
    }

    # Calculate weighted score: each criterion score is multiplied by its
    # weight (looked up by criterion name), then normalized by total weight.
    total_weight: float = sum(c["weight"] for c in criteria)
    weighted_score: float = sum(
        s["score"] * next(c["weight"] for c in criteria if c["name"] == s["criterion"])
        for s in expected_output["scores"]
    ) / total_weight

    print(f"Weighted Score: {weighted_score:.2f}/5")
    return expected_output


# =============================================================================
# PAIRWISE COMPARISON WITH POSITION BIAS MITIGATION
# =============================================================================


def pairwise_comparison_example() -> dict[str, Any]:
    """Compare two responses with position-swapped bias mitigation.

    Use when: evaluating subjective preferences like tone, style, or
    persuasiveness where pairwise comparison achieves higher human-judge
    agreement than direct scoring.

    Returns:
        Dictionary containing the winner, confidence score, and whether
        position consistency was achieved across both passes.
    """

    prompt: str = "Explain machine learning to a beginner"

    response_a: str = (
        "Machine learning is a subset of artificial intelligence that enables "
        "systems to learn and improve from experience without being explicitly "
        "programmed. It uses statistical techniques to give computers the ability "
        "to identify patterns in data."
    )

    response_b: str = (
        "Imagine teaching a dog a new trick. You show the dog what to do, give "
        "treats when it's right, and eventually it learns. Machine learning works "
        "similarly - we show computers lots of examples, tell them when they're "
        "right, and they learn to recognize patterns on their own."
    )

    criteria: list[str] = ["clarity", "accessibility", "accuracy"]

    # System prompt emphasizing bias awareness
    system_prompt: str = (
        "You are an expert evaluator comparing two AI responses.\n\n"
        "CRITICAL INSTRUCTIONS:\n"
        "- Do NOT prefer responses because they are longer\n"
        "- Do NOT prefer responses based on position (first vs second)\n"
        "- Focus ONLY on quality according to the specified criteria\n"
        "- Ties are acceptable when responses are genuinely equivalent"
    )

    # Build evaluation prompt for a given ordering
    def evaluate_pass(
        first_response: str,
        second_response: str,
        first_label: str,
        second_label: str,
    ) -> str:
        """Build evaluation prompt for one pass of position-swapped comparison.

        Use when: constructing the prompt for a single evaluation pass before
        swapping response positions for bias mitigation.
        """
        return f"""## Original Prompt
{prompt}

## Response {first_label}
{first_response}

## Response {second_label}
{second_response}

## Comparison Criteria
{', '.join(criteria)}

## Output Format
{{
"comparison": [
{{"criterion": "clarity", "winner": "A|B|TIE", "reasoning": "..."}}
],
"result": {{
"winner": "A|B|TIE",
"confidence": 0.0-1.0,
"reasoning": "overall reasoning"
}}
}}"""

    # Position bias mitigation protocol: evaluate twice with positions swapped.
    # (Results here are hard-coded stand-ins for actual judge-model calls.)
    print("Pass 1: A in first position")
    pass1_result: dict[str, Any] = {"winner": "B", "confidence": 0.8}

    print("Pass 2: B in first position (swapped)")
    pass2_result: dict[str, Any] = {"winner": "A", "confidence": 0.75}  # A because B was first

    # Map pass2 result back (swap labels)
    def map_winner(winner: str) -> str:
        """Map winner label after position swap."""
        return {"A": "B", "B": "A", "TIE": "TIE"}[winner]

    pass2_mapped: str = map_winner(pass2_result["winner"])
    print(f"Pass 2 mapped winner: {pass2_mapped}")

    # Check consistency: the same winner in both orderings means no position bias.
    consistent: bool = pass1_result["winner"] == pass2_mapped

    final_result: dict[str, Any]
    if consistent:
        final_result = {
            "winner": pass1_result["winner"],
            "confidence": (pass1_result["confidence"] + pass2_result["confidence"]) / 2,
            "position_consistent": True,
        }
    else:
        # Inconsistent verdicts across orderings indicate position bias;
        # fall back to a low-confidence tie rather than trusting either pass.
        final_result = {
            "winner": "TIE",
            "confidence": 0.5,
            "position_consistent": False,
            "bias_detected": True,
        }

    print(f"\nFinal Result: {final_result}")
    return final_result


# =============================================================================
# RUBRIC GENERATION
# =============================================================================


def rubric_generation_example() -> dict[str, Any]:
    """Generate a domain-specific scoring rubric for consistent evaluation.

    Use when: establishing evaluation standards for a new criterion, reducing
    scoring variance (rubrics cut variance by 40-60%), or onboarding new
    evaluators to an existing evaluation pipeline.

    Returns:
        Dictionary containing score levels, characteristics, examples,
        scoring guidelines, and edge case handling.
    """

    criterion_name: str = "Code Readability"
    criterion_description: str = "How easy the code is to understand and maintain"
    domain: str = "software engineering"
    scale: str = "1-5"
    strictness: str = "balanced"

    system_prompt: str = (
        f"You are an expert in creating evaluation rubrics.\n"
        f"Create clear, actionable rubrics with distinct boundaries between levels.\n\n"
        f"Strictness: {strictness}\n"
        f"- lenient: Lower bar for passing scores\n"
        f"- balanced: Fair, typical expectations\n"
        f"- strict: High standards, critical evaluation"
    )

    user_prompt: str = f"""Create a scoring rubric for:

**Criterion**: {criterion_name}
**Description**: {criterion_description}
**Scale**: {scale}
**Domain**: {domain}

Generate:
1. Clear descriptions for each score level
2. Specific characteristics that define each level
3. Brief example text for each level
4. General scoring guidelines
5. Edge cases with guidance"""

    # Expected rubric structure (anchor levels 1/3/5 shown; 2/4 interpolate)
    rubric: dict[str, Any] = {
        "criterion": criterion_name,
        "scale": {"min": 1, "max": 5},
        "levels": [
            {
                "score": 1,
                "label": "Poor",
                "description": "Code is difficult to understand without significant effort",
                "characteristics": [
                    "No meaningful variable or function names",
                    "No comments or documentation",
                    "Deeply nested or convoluted logic",
                ],
                "example": "def f(x): return x[0]*x[1]+x[2]",
            },
            {
                "score": 3,
                "label": "Adequate",
                "description": "Code is understandable with some effort",
                "characteristics": [
                    "Most variables have meaningful names",
                    "Basic comments for complex sections",
                    "Logic is followable but could be cleaner",
                ],
                "example": (
                    "def calc_total(items): # calculate sum\n"
                    "    total = 0\n"
                    "    for i in items: total += i\n"
                    "    return total"
                ),
            },
            {
                "score": 5,
                "label": "Excellent",
                "description": "Code is immediately clear and maintainable",
                "characteristics": [
                    "All names are descriptive and consistent",
                    "Comprehensive documentation",
                    "Clean, modular structure",
                ],
                "example": (
                    "def calculate_total_price(items: List[Item]) -> Decimal:\n"
                    "    '''Calculate the total price of all items.'''\n"
                    "    return sum(item.price for item in items)"
                ),
            },
        ],
        "scoring_guidelines": [
            "Focus on readability, not cleverness",
            "Consider the intended audience (team skill level)",
            "Consistency matters more than style preference",
        ],
        "edge_cases": [
            {
                "situation": "Code uses domain-specific abbreviations",
                "guidance": "Score based on readability for domain experts, not general audience",
            },
            {
                "situation": "Code is auto-generated",
                "guidance": "Apply same standards but note in evaluation",
            },
        ],
    }

    print("Generated Rubric:")
    for level in rubric["levels"]:
        print(f"  {level['score']}: {level['label']} - {level['description']}")

    return rubric


# =============================================================================
# MAIN
# =============================================================================

if __name__ == "__main__":
    print("=" * 60)
    print("DIRECT SCORING EXAMPLE")
    print("=" * 60)
    direct_scoring_example()

    print("\n" + "=" * 60)
    print("PAIRWISE COMPARISON EXAMPLE")
    print("=" * 60)
    pairwise_comparison_example()

    print("\n" + "=" * 60)
    print("RUBRIC GENERATION EXAMPLE")
    print("=" * 60)
    rubric_generation_example()