Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Syntax-highlighted preview of this file as included in the skill package.
skills/context-compression/scripts/compression_evaluator.py
1"""2Context Compression Evaluation34Public API for evaluating context compression quality using probe-based5assessment. This module provides three composable components:67- **ProbeGenerator**: Extracts factual claims, file operations, and decisions8from conversation history, then generates typed probes for evaluation.9Use when: building a compression evaluation pipeline and needing to10automatically derive test questions from raw conversation history.1112- **CompressionEvaluator**: Scores probe responses against a multi-dimensional13rubric (accuracy, context awareness, artifact trail, completeness,14continuity, instruction following). Use when: comparing compression methods15or validating that a compression strategy preserves critical information.1617- **StructuredSummarizer**: Implements anchored iterative summarization with18explicit sections for session intent, file tracking, decisions, and next19steps. Use when: compressing long-running coding sessions where file20tracking and decision rationale must survive compression.2122Top-level convenience function:23- **evaluate_compression_quality**: End-to-end pipeline that generates probes,24collects model responses, evaluates them, and returns a scored summary with25recommendations. Use when: running a one-shot compression quality check26without wiring up individual components.2728PRODUCTION NOTES:29- The LLM judge calls are stubbed for demonstration. Production systems30should implement actual API calls to a frontier model.31- Token estimation uses simplified heuristics. Production systems should32use model-specific tokenizers.33- Ground truth extraction uses pattern matching. Production systems may34benefit from more sophisticated fact extraction.35"""3637from dataclasses import dataclass, field38from typing import List, Dict, Optional, Callable39from enum import Enum40import json41import re4243__all__ = [44"ProbeType",45"Probe",46"CriterionResult",47"EvaluationResult",48"RUBRIC_CRITERIA",49"ProbeGenerator",50"CompressionEvaluator",51"StructuredSummarizer",52"evaluate_compression_quality",53]545556class ProbeType(Enum):57"""Types of evaluation probes for compression quality assessment."""58RECALL = "recall"59ARTIFACT = "artifact"60CONTINUATION = "continuation"61DECISION = "decision"626364@dataclass65class Probe:66"""A probe question for evaluating compression quality.6768Use when: constructing evaluation inputs for CompressionEvaluator.69Each probe targets a specific information category that compression70may have lost.71"""72probe_type: ProbeType73question: str74ground_truth: Optional[str] = None75context_reference: Optional[str] = None767778@dataclass79class CriterionResult:80"""Result for a single evaluation criterion."""81criterion_id: str82score: float83reasoning: str848586@dataclass87class EvaluationResult:88"""Complete evaluation result for a probe response.8990Contains per-criterion scores, per-dimension aggregates, and an91overall aggregate score.92"""93probe: Probe94response: str95criterion_results: List[CriterionResult]96aggregate_score: float97dimension_scores: Dict[str, float] = field(default_factory=dict)9899100# Evaluation Rubrics101102RUBRIC_CRITERIA: Dict[str, List[Dict]] = {103"accuracy": [104{105"id": "accuracy_factual",106"question": "Are facts, file paths, and technical details correct?",107"weight": 0.6108},109{110"id": "accuracy_technical",111"question": "Are code references and technical concepts correct?",112"weight": 0.4113}114],115"context_awareness": [116{117"id": "context_conversation_state",118"question": 
"Does the response reflect current conversation state?",119"weight": 0.5120},121{122"id": "context_artifact_state",123"question": "Does the response reflect which files/artifacts were accessed?",124"weight": 0.5125}126],127"artifact_trail": [128{129"id": "artifact_files_created",130"question": "Does the agent know which files were created?",131"weight": 0.3132},133{134"id": "artifact_files_modified",135"question": "Does the agent know which files were modified?",136"weight": 0.4137},138{139"id": "artifact_key_details",140"question": "Does the agent remember function names, variable names, error messages?",141"weight": 0.3142}143],144"completeness": [145{146"id": "completeness_coverage",147"question": "Does the response address all parts of the question?",148"weight": 0.6149},150{151"id": "completeness_depth",152"question": "Is sufficient detail provided?",153"weight": 0.4154}155],156"continuity": [157{158"id": "continuity_work_state",159"question": "Can the agent continue without re-fetching information?",160"weight": 0.4161},162{163"id": "continuity_todo_state",164"question": "Does the agent maintain awareness of pending tasks?",165"weight": 0.3166},167{168"id": "continuity_reasoning",169"question": "Does the agent retain rationale behind previous decisions?",170"weight": 0.3171}172],173"instruction_following": [174{175"id": "instruction_format",176"question": "Does the response follow the requested format?",177"weight": 0.5178},179{180"id": "instruction_constraints",181"question": "Does the response respect stated constraints?",182"weight": 0.5183}184]185}186187188class ProbeGenerator:189"""Generate typed probes from conversation history.190191Use when: automatically deriving evaluation questions from raw192conversation history at compression points. Extracts facts, file193operations, and decisions via pattern matching, then produces194one probe per category.195196For production systems, replace the regex-based extraction with197an LLM-based extractor for higher recall.198"""199200def __init__(self, conversation_history: str) -> None:201self.history = conversation_history202self.extracted_facts = self._extract_facts()203self.extracted_files = self._extract_files()204self.extracted_decisions = self._extract_decisions()205206def generate_probes(self) -> List[Probe]:207"""Generate all probe types for evaluation.208209Use when: preparing evaluation inputs at a compression point.210Returns one probe per category (recall, artifact, continuation,211decision) based on extractable content from the history.212"""213probes: List[Probe] = []214215# Recall probes216if self.extracted_facts:217probes.append(Probe(218probe_type=ProbeType.RECALL,219question="What was the original error or issue that started this session?",220ground_truth=self.extracted_facts.get("original_error"),221context_reference="session_start"222))223224# Artifact probes225if self.extracted_files:226probes.append(Probe(227probe_type=ProbeType.ARTIFACT,228question="Which files have we modified? 
Describe what changed in each.",229ground_truth=json.dumps(self.extracted_files),230context_reference="file_operations"231))232233# Continuation probes234probes.append(Probe(235probe_type=ProbeType.CONTINUATION,236question="What should we do next?",237ground_truth=self.extracted_facts.get("next_steps"),238context_reference="task_state"239))240241# Decision probes242if self.extracted_decisions:243probes.append(Probe(244probe_type=ProbeType.DECISION,245question="What key decisions did we make and why?",246ground_truth=json.dumps(self.extracted_decisions),247context_reference="decision_points"248))249250return probes251252def _extract_facts(self) -> Dict[str, str]:253"""Extract factual claims from history."""254facts: Dict[str, str] = {}255256# Extract error patterns257error_patterns = [258r"error[:\s]+(.+?)(?:\n|$)",259r"(\d{3})\s+(Unauthorized|Not Found|Internal Server Error)",260r"exception[:\s]+(.+?)(?:\n|$)"261]262263for pattern in error_patterns:264match = re.search(pattern, self.history, re.IGNORECASE)265if match:266facts["original_error"] = match.group(0).strip()267break268269# Extract next steps270next_step_patterns = [271r"next[:\s]+(.+?)(?:\n|$)",272r"TODO[:\s]+(.+?)(?:\n|$)",273r"remaining[:\s]+(.+?)(?:\n|$)"274]275276for pattern in next_step_patterns:277match = re.search(pattern, self.history, re.IGNORECASE)278if match:279facts["next_steps"] = match.group(0).strip()280break281282return facts283284def _extract_files(self) -> List[Dict[str, str]]:285"""Extract file operations from history."""286files: List[Dict[str, str]] = []287288# Common file patterns289file_patterns = [290r"(?:modified|changed|updated|edited)\s+([^\s]+\.[a-z]+)",291r"(?:created|added)\s+([^\s]+\.[a-z]+)",292r"(?:read|examined|opened)\s+([^\s]+\.[a-z]+)"293]294295for pattern in file_patterns:296matches = re.findall(pattern, self.history, re.IGNORECASE)297for match in matches:298if match not in [f["path"] for f in files]:299files.append({300"path": match,301"operation": "modified" if "modif" in pattern else "created" if "creat" in pattern else "read"302})303304return files305306def _extract_decisions(self) -> List[Dict[str, str]]:307"""Extract decision points from history."""308decisions: List[Dict[str, str]] = []309310decision_patterns = [311r"decided to\s+(.+?)(?:\n|$)",312r"chose\s+(.+?)(?:\n|$)",313r"going with\s+(.+?)(?:\n|$)",314r"will use\s+(.+?)(?:\n|$)"315]316317for pattern in decision_patterns:318matches = re.findall(pattern, self.history, re.IGNORECASE)319for match in matches:320decisions.append({321"decision": match.strip(),322"context": pattern.split("\\s+")[0]323})324325return decisions[:5] # Limit to 5 decisions326327328class CompressionEvaluator:329"""Evaluate compression quality using probes and LLM judge.330331Use when: comparing compression methods or validating that a specific332compression pass preserved critical information. Scores responses333across six dimensions (accuracy, context awareness, artifact trail,334completeness, continuity, instruction following) and produces an335aggregate quality score.336337The evaluate() method is the primary entry point. 
Call it once per338probe, then call get_summary() to retrieve aggregated results.339"""340341def __init__(self, model: str = "gpt-5.2") -> None:342self.model = model343self.results: List[EvaluationResult] = []344345def evaluate(self,346probe: Probe,347response: str,348compressed_context: str) -> EvaluationResult:349"""Evaluate a single probe response against the rubric.350351Use when: scoring how well a model's response (given compressed352context) answers a probe question. Returns per-criterion scores,353per-dimension aggregates, and an overall score.354355Args:356probe: The probe question with expected ground truth.357response: The model's response to evaluate.358compressed_context: The compressed context that was provided359to the model when generating the response.360361Returns:362EvaluationResult with scores and reasoning across all363applicable dimensions.364"""365# Get relevant criteria based on probe type366criteria = self._get_criteria_for_probe(probe.probe_type)367368# Evaluate each criterion369criterion_results: List[CriterionResult] = []370for criterion in criteria:371result = self._evaluate_criterion(372criterion,373probe,374response,375compressed_context376)377criterion_results.append(result)378379# Calculate dimension scores380dimension_scores = self._calculate_dimension_scores(criterion_results)381382# Calculate aggregate score383aggregate_score = sum(dimension_scores.values()) / len(dimension_scores) if dimension_scores else 0.0384385result = EvaluationResult(386probe=probe,387response=response,388criterion_results=criterion_results,389aggregate_score=aggregate_score,390dimension_scores=dimension_scores391)392393self.results.append(result)394return result395396def get_summary(self) -> Dict:397"""Get summary of all evaluation results.398399Use when: all probes have been evaluated and an aggregate400report is needed to compare methods or make a go/no-go401decision on a compression strategy.402403Returns:404Dictionary with total evaluations, average score,405per-dimension averages, and weakest/strongest dimensions.406"""407if not self.results:408return {"error": "No evaluations performed"}409410avg_score = sum(r.aggregate_score for r in self.results) / len(self.results)411412# Average dimension scores413dimension_totals: Dict[str, float] = {}414dimension_counts: Dict[str, int] = {}415416for result in self.results:417for dim, score in result.dimension_scores.items():418dimension_totals[dim] = dimension_totals.get(dim, 0) + score419dimension_counts[dim] = dimension_counts.get(dim, 0) + 1420421avg_dimensions = {422dim: dimension_totals[dim] / dimension_counts[dim]423for dim in dimension_totals424}425426return {427"total_evaluations": len(self.results),428"average_score": avg_score,429"dimension_averages": avg_dimensions,430"weakest_dimension": min(avg_dimensions, key=avg_dimensions.get) if avg_dimensions else None,431"strongest_dimension": max(avg_dimensions, key=avg_dimensions.get) if avg_dimensions else None,432}433434def _get_criteria_for_probe(self, probe_type: ProbeType) -> List[Dict]:435"""Get relevant criteria for probe type."""436criteria: List[Dict] = []437438# All probes get accuracy and completeness439criteria.extend(RUBRIC_CRITERIA["accuracy"])440criteria.extend(RUBRIC_CRITERIA["completeness"])441442# Add type-specific criteria443if probe_type == ProbeType.ARTIFACT:444criteria.extend(RUBRIC_CRITERIA["artifact_trail"])445elif probe_type == ProbeType.CONTINUATION:446criteria.extend(RUBRIC_CRITERIA["continuity"])447elif probe_type == 
ProbeType.RECALL:448criteria.extend(RUBRIC_CRITERIA["context_awareness"])449elif probe_type == ProbeType.DECISION:450criteria.extend(RUBRIC_CRITERIA["context_awareness"])451criteria.extend(RUBRIC_CRITERIA["continuity"])452453criteria.extend(RUBRIC_CRITERIA["instruction_following"])454455return criteria456457def _evaluate_criterion(self,458criterion: Dict,459probe: Probe,460response: str,461context: str) -> CriterionResult:462"""463Evaluate a single criterion using LLM judge.464465PRODUCTION NOTE: This is a stub implementation.466Production systems should call the actual LLM API:467468```python469result = openai.chat.completions.create(470model="gpt-5.2",471messages=[472{"role": "system", "content": JUDGE_SYSTEM_PROMPT},473{"role": "user", "content": self._format_judge_input(criterion, probe, response, context)}474]475)476return self._parse_judge_output(result)477```478"""479# Stub implementation - in production, call LLM judge480score = self._heuristic_score(criterion, response, probe.ground_truth)481reasoning = f"Evaluated {criterion['id']} based on response content."482483return CriterionResult(484criterion_id=criterion["id"],485score=score,486reasoning=reasoning487)488489def _heuristic_score(self,490criterion: Dict,491response: str,492ground_truth: Optional[str]) -> float:493"""494Heuristic scoring for demonstration.495496Production systems should use LLM judge instead.497"""498score = 3.0 # Base score499500# Adjust based on response length and content501if len(response) < 50:502score -= 1.0 # Too short503elif len(response) > 500:504score += 0.5 # Detailed505506# Check for technical content507if any(ext in response for ext in [".ts", ".py", ".js", ".md"]):508score += 0.5 # Contains file references509510overlap_ratio = self._ground_truth_overlap_ratio(response, ground_truth)511if overlap_ratio >= 0.75:512score += 1.0513elif overlap_ratio >= 0.4:514score += 0.5515elif ground_truth:516score -= 0.5517518return min(5.0, max(0.0, score))519520def _ground_truth_overlap_ratio(self,521response: str,522ground_truth: Optional[str]) -> float:523if not ground_truth:524return 0.0525526terms = self._extract_ground_truth_terms(ground_truth)527if not terms:528return 1.0 if ground_truth.lower() in response.lower() else 0.0529530response_lower = response.lower()531matches = sum(1 for term in terms if term in response_lower)532return matches / len(terms)533534def _extract_ground_truth_terms(self, ground_truth: str) -> List[str]:535try:536parsed = json.loads(ground_truth)537except json.JSONDecodeError:538return [ground_truth.lower()] if ground_truth.strip() else []539540terms: List[str] = []541542def collect(value) -> None:543if isinstance(value, str):544normalized = value.strip().lower()545if normalized:546terms.append(normalized)547elif isinstance(value, dict):548for nested in value.values():549collect(nested)550elif isinstance(value, list):551for nested in value:552collect(nested)553554collect(parsed)555return list(dict.fromkeys(terms))556557def _calculate_dimension_scores(self,558criterion_results: List[CriterionResult]) -> Dict[str, float]:559"""Calculate dimension scores from criterion results."""560dimension_scores: Dict[str, float] = {}561562for dimension, criteria in RUBRIC_CRITERIA.items():563criterion_ids = [c["id"] for c in criteria]564relevant_results = [565r for r in criterion_results566if r.criterion_id in criterion_ids567]568569if relevant_results:570# Weighted average571total_weight = sum(572c["weight"] for c in criteria573if c["id"] in [r.criterion_id for r in 
relevant_results]574)575weighted_sum = sum(576r.score * next(c["weight"] for c in criteria if c["id"] == r.criterion_id)577for r in relevant_results578)579dimension_scores[dimension] = weighted_sum / total_weight if total_weight > 0 else 0.0580581return dimension_scores582583584class StructuredSummarizer:585"""Generate structured summaries with explicit sections.586587Use when: implementing anchored iterative summarization for588long-running coding sessions. Maintains a persistent summary589with dedicated sections for session intent, file modifications,590decisions, current state, and next steps.591592Call update_from_span() each time a new content span is truncated.593The summarizer merges new information into existing sections rather594than regenerating, preventing cumulative detail loss.595"""596597TEMPLATE = """## Session Intent598{intent}599600## Files Modified601{files_modified}602603## Files Read (Not Modified)604{files_read}605606## Decisions Made607{decisions}608609## Current State610{current_state}611612## Next Steps613{next_steps}614"""615616def __init__(self) -> None:617self.sections: Dict = {618"intent": "",619"files_modified": [],620"files_read": [],621"decisions": [],622"current_state": "",623"next_steps": []624}625626def update_from_span(self, new_content: str) -> str:627"""Update summary from newly truncated content span.628629Use when: a compression trigger fires and a portion of630conversation history is about to be discarded. Pass the631content that will be truncated; the summarizer extracts632structured information and merges it with prior state.633634Args:635new_content: The conversation span being truncated.636637Returns:638Formatted summary string with all sections populated.639"""640# Extract information from new content641new_info = self._extract_from_content(new_content)642643# Merge with existing sections644self._merge_sections(new_info)645646# Generate formatted summary647return self._format_summary()648649def _extract_from_content(self, content: str) -> Dict:650"""Extract structured information from content."""651extracted: Dict = {652"intent": "",653"files_modified": [],654"files_read": [],655"decisions": [],656"current_state": "",657"next_steps": []658}659660# Extract file modifications661mod_pattern = r"(?:modified|changed|updated|fixed)\s+([^\s]+\.[a-z]+)[:\s]*(.+?)(?:\n|$)"662for match in re.finditer(mod_pattern, content, re.IGNORECASE):663extracted["files_modified"].append({664"path": match.group(1),665"change": match.group(2).strip()[:100]666})667668# Extract file reads669read_pattern = r"(?:read|examined|opened|checked)\s+([^\s]+\.[a-z]+)"670for match in re.finditer(read_pattern, content, re.IGNORECASE):671file_path = match.group(1)672if file_path not in [f["path"] for f in extracted["files_modified"]]:673extracted["files_read"].append(file_path)674675# Extract decisions676decision_pattern = r"(?:decided|chose|going with|will use)\s+(.+?)(?:\n|$)"677for match in re.finditer(decision_pattern, content, re.IGNORECASE):678extracted["decisions"].append(match.group(1).strip()[:150])679680return extracted681682def _merge_sections(self, new_info: Dict) -> None:683"""Merge new information with existing sections."""684# Update intent if empty685if new_info["intent"] and not self.sections["intent"]:686self.sections["intent"] = new_info["intent"]687688# Merge file lists (deduplicate by path)689existing_mod_paths = [f["path"] for f in self.sections["files_modified"]]690for file_info in new_info["files_modified"]:691if file_info["path"] not in 
existing_mod_paths:692self.sections["files_modified"].append(file_info)693694# Merge read files695for file_path in new_info["files_read"]:696if file_path not in self.sections["files_read"]:697self.sections["files_read"].append(file_path)698699# Append decisions700self.sections["decisions"].extend(new_info["decisions"])701702# Update current state (latest wins)703if new_info["current_state"]:704self.sections["current_state"] = new_info["current_state"]705706# Merge next steps707self.sections["next_steps"].extend(new_info["next_steps"])708709def _format_summary(self) -> str:710"""Format sections into summary string."""711files_modified_str = "\n".join(712f"- {f['path']}: {f['change']}"713for f in self.sections["files_modified"]714) or "None"715716files_read_str = "\n".join(717f"- {f}" for f in self.sections["files_read"]718) or "None"719720decisions_str = "\n".join(721f"- {d}" for d in self.sections["decisions"][-5:] # Keep last 5722) or "None"723724next_steps_str = "\n".join(725f"{i+1}. {s}" for i, s in enumerate(self.sections["next_steps"][-5:])726) or "None"727728return self.TEMPLATE.format(729intent=self.sections["intent"] or "Not specified",730files_modified=files_modified_str,731files_read=files_read_str,732decisions=decisions_str,733current_state=self.sections["current_state"] or "In progress",734next_steps=next_steps_str735)736737738def evaluate_compression_quality(739original_history: str,740compressed_context: str,741model_response_fn: Callable[[str, str], str],742) -> Dict:743"""Evaluate compression quality for a conversation end-to-end.744745Use when: running a one-shot quality check on a compression pass.746Generates probes from original history, collects model responses747using the compressed context, evaluates each response, and returns748a scored summary with actionable recommendations.749750Args:751original_history: The full conversation before compression.752compressed_context: The compressed version to evaluate.753model_response_fn: Callable that takes (compressed_context, question)754and returns the model's response string.755756Returns:757Dictionary with total evaluations, average score, per-dimension758averages, weakest/strongest dimensions, and recommendations list.759"""760# Generate probes761generator = ProbeGenerator(original_history)762probes = generator.generate_probes()763764# Evaluate each probe765evaluator = CompressionEvaluator()766767for probe in probes:768# Get model response using compressed context769response = model_response_fn(compressed_context, probe.question)770771# Evaluate response772evaluator.evaluate(probe, response, compressed_context)773774# Get summary775summary = evaluator.get_summary()776777# Add recommendations778summary["recommendations"] = []779780if summary.get("weakest_dimension") == "artifact_trail":781summary["recommendations"].append(782"Consider implementing separate artifact tracking outside compression"783)784785if summary.get("average_score", 0) < 3.5:786summary["recommendations"].append(787"Compression quality is below threshold - consider less aggressive compression"788)789790return summary791792793if __name__ == "__main__":794# Demo: generate probes and evaluate a sample compression795796sample_history = """797User reported error: 401 Unauthorized on /api/auth/login endpoint.798Examined auth.controller.ts - JWT generation looks correct.799Examined middleware/cors.ts - no issues found.800Modified config/redis.ts: Fixed connection pooling configuration.801Modified services/session.service.ts: Added retry logic for transient 
failures.802Decided to use Redis connection pool instead of per-request connections.803Modified tests/auth.test.ts: Updated mock setup for new config.80414 tests passing, 2 failing (mock setup issues).805Next: Fix remaining test failures in session service mocks.806"""807808sample_compressed = """809## Session Intent810Debug 401 Unauthorized on /api/auth/login.811812## Root Cause813Stale Redis connection in session store.814815## Files Modified816- config/redis.ts: Fixed connection pooling817- services/session.service.ts: Added retry logic818- tests/auth.test.ts: Updated mock setup819820## Test Status82114 passing, 2 failing822823## Next Steps8241. Fix remaining test failures825"""826827# Stub model response function828def mock_model_response(context: str, question: str) -> str:829if "error" in question.lower():830return "The original error was a 401 Unauthorized on /api/auth/login."831if "files" in question.lower():832return "Modified config/redis.ts, services/session.service.ts, tests/auth.test.ts."833if "next" in question.lower():834return "Fix remaining test failures in session service mocks."835if "decision" in question.lower():836return "Decided to use Redis connection pool instead of per-request connections."837return "No specific information available."838839# Run evaluation840result = evaluate_compression_quality(841original_history=sample_history,842compressed_context=sample_compressed,843model_response_fn=mock_model_response,844)845846print("=== Compression Quality Evaluation ===")847print(f"Total evaluations: {result['total_evaluations']}")848print(f"Average score: {result['average_score']:.2f}")849print()850print("Dimension averages:")851for dim, score in result.get("dimension_averages", {}).items():852print(f" {dim}: {score:.2f}")853print()854print(f"Weakest dimension: {result.get('weakest_dimension')}")855print(f"Strongest dimension: {result.get('strongest_dimension')}")856print()857if result.get("recommendations"):858print("Recommendations:")859for rec in result["recommendations"]:860print(f" - {rec}")861else:862print("No recommendations - compression quality looks acceptable.")863
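The PRODUCTION NOTES in the module docstring ask for the stubbed model calls to be replaced with real API calls. Below is a minimal sketch of one way to supply a live model_response_fn to evaluate_compression_quality, assuming the OpenAI Python SDK; the client setup, system prompt, and model name are placeholders for illustration, not part of the skill package.

# Sketch: answer each probe with a real model, using only the compressed context.
# Assumes `pip install openai` and OPENAI_API_KEY in the environment; the model
# name below is a placeholder, not something this skill prescribes.
from openai import OpenAI

client = OpenAI()

def live_model_response(compressed_context: str, question: str) -> str:
    """Answer a probe question given only the compressed context."""
    completion = client.chat.completions.create(
        model="gpt-4o",  # placeholder; use whichever model you are evaluating
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{compressed_context}\n\nQuestion: {question}"},
        ],
    )
    return completion.choices[0].message.content or ""

# summary = evaluate_compression_quality(original_history, compressed_context, live_model_response)

Keeping the probe-answering model behind a plain (context, question) -> str callable is what lets the evaluator stay provider-agnostic; the same wiring pattern applies to the stubbed LLM judge in _evaluate_criterion.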