Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/interleaved-thinking/reasoning_trace_optimizer/analyzer.py
"""
TraceAnalyzer: Analyzes reasoning traces to detect patterns and issues.

Uses M2.1's own interleaved thinking to analyze agent reasoning traces,
detecting patterns like context degradation, tool confusion, and instruction drift.
"""

import json
import os
import re
from typing import Any

import anthropic

from reasoning_trace_optimizer.models import (
    AnalysisResult,
    Pattern,
    PatternType,
    ReasoningTrace,
    Severity,
)


ANALYSIS_SYSTEM_PROMPT = """You are an expert AI agent debugger specializing in analyzing reasoning traces.

Your task is to analyze an agent's interleaved thinking trace and identify:
1. **Patterns of failure** - detect specific failure modes with evidence
2. **Quality scores** - rate the agent's reasoning on multiple dimensions
3. **Actionable recommendations** - specific improvements for prompts/instructions

## Pattern Definitions

Detect these patterns with specific evidence from thinking blocks:

- **context_degradation**: Agent loses or forgets information from earlier in the conversation
- Look for: Repeated questions, contradicting earlier statements, missing key details
- **tool_confusion**: Agent misunderstands what a tool does or how to use it
- Look for: Wrong tool selection, incorrect parameters, misinterpreting results
- **instruction_drift**: Agent gradually deviates from original instructions/persona
- Look for: Changing behavior, ignoring constraints, different tone over time
- **hallucination**: Agent generates information not supported by context or tools
- Look for: Made-up facts, fabricated tool results, unsourced claims
- **incomplete_reasoning**: Agent reaches conclusions without thorough analysis
- Look for: Skipped steps, missing validation, superficial exploration
- **tool_misuse**: Agent uses tools incorrectly or inefficiently
- Look for: Redundant calls, wrong parameters, unused results
- **goal_abandonment**: Agent stops pursuing the original objective
- Look for: Topic drift, giving up, switching goals without reason
- **circular_reasoning**: Agent repeats similar actions without progress
- Look for: Same queries repeated, looping behavior, no new information
- **premature_conclusion**: Agent concludes before completing the task
- Look for: Early stops, incomplete answers, skipped requirements
- **missing_validation**: Agent doesn't verify results or assumptions
- Look for: No cross-checking, accepting first result, no error handling

## Analysis Focus

You have access to the FULL reasoning trace including all thinking blocks between tool calls.
This gives you unique insight into HOW the agent reasons, not just what it outputs.

For each thinking block, examine:
- What is the agent's current understanding?
- How does it interpret tool results?
- What alternatives does it consider?
- Does it maintain awareness of the original goal?

Provide your analysis in the specified JSON format with concrete evidence."""


ANALYSIS_PROMPT_TEMPLATE = """Analyze the following agent reasoning trace:

## Task
{task}

## System Prompt Given to Agent
{system_prompt}

## Reasoning Trace
{trace}

## Tool Calls Made
{tool_calls}

## Final Outcome
Success: {success}
Final Response: {final_response}
Error (if any): {error}

---

Provide your analysis as JSON with this exact structure:
```json
{{
"patterns": [
{{
"type": "<one of: context_degradation, tool_confusion, instruction_drift, hallucination, incomplete_reasoning, tool_misuse, goal_abandonment, circular_reasoning, premature_conclusion, missing_validation>",
"severity": "<one of: low, medium, high, critical>",
"description": "<what the pattern is>",
"evidence": ["<excerpt from thinking>", "<another excerpt>"],
"turn_indices": [0, 2],
"suggestion": "<how to fix this>",
"confidence": 0.85
}}
],
"scores": {{
"reasoning_clarity": 75,
"goal_adherence": 80,
"tool_usage_quality": 60,
"error_recovery": 50,
"overall": 66
}},
"strengths": ["<strength 1>", "<strength 2>"],
"weaknesses": ["<weakness 1>", "<weakness 2>"],
"recommendations": [
"<specific actionable recommendation>",
"<another recommendation>"
]
}}
```

Think carefully about each aspect before providing your analysis."""


# Maximum number of characters of a tool result shown in the analysis prompt.
_RESULT_PREVIEW_CHARS = 200

# First (optionally decimal) number in a free-text reply; used by quick_score.
_FIRST_NUMBER = re.compile(r"\d+(?:\.\d+)?")

# Score patterns tried, in order, when the analysis JSON could not be parsed.
_FALLBACK_PARSE_SCORE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'overall["\s:]+(\d+)',
        r'Overall Score[:\s]+(\d+)',
        r'"overall"[:\s]+(\d+)',
        r'Score[:\s]+(\d+)/100',
    )
)

# Score patterns tried, in order, when a parsed analysis has a zero score
# and no patterns (which usually means the model ignored the JSON schema).
_FALLBACK_EXTRACT_SCORE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'overall["\s:]+(\d+)',
        r'Overall Score[:\s]+(\d+)',
        r'"overall"[:\s]+(\d+)',
        r'(\d+)/100',
        r'score[:\s]+(\d+)',
    )
)


class TraceAnalyzer:
    """
    Analyzes reasoning traces using M2.1 to detect patterns and score quality.

    The analyzer uses M2.1's interleaved thinking to deeply understand
    the agent's reasoning process and identify issues that wouldn't be
    visible from outputs alone.

    Example:
        ```python
        analyzer = TraceAnalyzer()
        result = analyzer.analyze(trace)

        print(f"Overall score: {result.overall_score}")
        for pattern in result.patterns:
            print(f"Found: {pattern.type.value} ({pattern.severity.value})")
        ```
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = "https://api.minimax.io/anthropic",
        model: str = "MiniMax-M2.1",
    ):
        """
        Initialize TraceAnalyzer with M2.1 configuration.

        Args:
            api_key: MiniMax API key. When omitted, the value of the
                ANTHROPIC_API_KEY environment variable is used.
            base_url: API endpoint
            model: Model for analysis (M2.1 recommended for best results)
        """
        self.model = model
        self.client = anthropic.Anthropic(
            api_key=api_key or os.environ.get("ANTHROPIC_API_KEY"),
            base_url=base_url,
        )

    def analyze(
        self,
        trace: ReasoningTrace,
        max_tokens: int = 8192,
    ) -> AnalysisResult:
        """
        Analyze a reasoning trace and return detailed analysis.

        Args:
            trace: The reasoning trace to analyze
            max_tokens: Maximum tokens for analysis response

        Returns:
            AnalysisResult with patterns, scores, and recommendations
        """
        # Format trace for analysis
        prompt = ANALYSIS_PROMPT_TEMPLATE.format(
            task=trace.task,
            system_prompt=trace.system_prompt,
            trace=self._format_trace_for_analysis(trace),
            tool_calls=self._format_tool_calls(trace),
            success=trace.success,
            final_response=trace.final_response or "None",
            error=trace.error or "None",
        )

        # Call M2.1 for analysis
        response = self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            system=ANALYSIS_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": prompt}],
        )

        # A response may contain several thinking/text blocks; keep all of
        # them instead of silently retaining only the last of each kind.
        thinking_parts: list[str] = []
        text_parts: list[str] = []
        for block in response.content:
            if block.type == "thinking":
                thinking_parts.append(block.thinking)
            elif block.type == "text":
                text_parts.append(block.text)

        # Parse the JSON response
        result = self._parse_analysis_response("\n".join(text_parts), trace.session_id)
        result.analyzer_thinking = "\n\n".join(thinking_parts)
        result.analyzer_model = self.model

        return result

    def analyze_batch(
        self,
        traces: list[ReasoningTrace],
    ) -> list[AnalysisResult]:
        """Analyze multiple traces sequentially and return results in input order."""
        return [self.analyze(trace) for trace in traces]

    def quick_score(
        self,
        trace: ReasoningTrace,
    ) -> float:
        """
        Get a quick overall score without full pattern analysis.

        Useful for optimization loops where you need fast feedback.

        Args:
            trace: The reasoning trace to score

        Returns:
            Overall score from 0-100; 50.0 when no number can be found in
            the model's reply.
        """
        quick_prompt = f"""Rate this agent's performance from 0-100 based on its reasoning trace.

Task: {trace.task}
Success: {trace.success}
Turns: {trace.total_turns}

Thinking excerpts:
{self._get_thinking_excerpts(trace, max_chars=2000)}

Respond with ONLY a number from 0-100."""

        response = self.client.messages.create(
            model=self.model,
            max_tokens=100,
            messages=[{"role": "user", "content": quick_prompt}],
        )

        # Extract score from response
        for block in response.content:
            if block.type != "text":
                continue
            reply = block.text.strip()
            try:
                score = float(reply)
            except ValueError:
                # The model did not answer with a bare number (e.g.
                # "Score: 85" or "85/100"); take the first number it gave.
                match = _FIRST_NUMBER.search(reply)
                if match is None:
                    continue
                score = float(match.group(0))
            # Clamp with float bounds so the declared return type holds.
            return min(100.0, max(0.0, score))

        return 50.0  # Default middle score if parsing fails

    def _format_trace_for_analysis(self, trace: ReasoningTrace) -> str:
        """Format thinking blocks for analysis, one labelled section per block."""
        parts = []
        for thinking in trace.thinking_blocks:
            parts.append(f"[Turn {thinking.turn_index}] Thinking:")
            parts.append(thinking.content)
            parts.append("")

        return "\n".join(parts)

    def _format_tool_calls(self, trace: ReasoningTrace) -> str:
        """Format tool calls for analysis.

        Each call is rendered with its JSON-encoded input, its status, and a
        preview of its result limited to ``_RESULT_PREVIEW_CHARS`` characters.
        """
        if not trace.tool_calls:
            return "No tool calls made."

        parts = []
        for tc in trace.tool_calls:
            status = "Success" if tc.success else f"Failed: {tc.error}"

            if tc.result:
                # str() guards against non-string results, which cannot be
                # sliced the way the preview needs.
                result_text = str(tc.result)
                preview = result_text[:_RESULT_PREVIEW_CHARS]
                # Only mark the preview as truncated when it actually is.
                if len(result_text) > _RESULT_PREVIEW_CHARS:
                    preview += "..."
            else:
                preview = "None"

            # default=str: this is display-only text for the prompt, so a
            # non-serialisable input value should not abort the analysis.
            rendered_input = json.dumps(tc.input, default=str)
            parts.append(
                f"- {tc.name}({rendered_input}) -> {status}\n"
                f" Result: {preview}"
            )

        return "\n".join(parts)

    def _get_thinking_excerpts(self, trace: ReasoningTrace, max_chars: int = 2000) -> str:
        """Get excerpts from thinking blocks within a rough character budget.

        Args:
            trace: The reasoning trace whose thinking blocks are sampled
            max_chars: Approximate total character budget for all excerpts
        """
        excerpts = []
        remaining = max_chars

        for thinking in trace.thinking_blocks:
            if remaining <= 0:
                break
            excerpt = thinking.content[:remaining]
            excerpts.append(f"[Turn {thinking.turn_index}]: {excerpt}")
            # The extra 20 approximates the "[Turn N]: " label and separator.
            remaining -= len(excerpt) + 20

        return "\n\n".join(excerpts)

    @staticmethod
    def _extract_json_text(response_text: str) -> str:
        """Return the part of *response_text* most likely to be the JSON payload.

        Tries, in order: a ```json fenced block, any fenced block, the span
        from the first "{" to the last "}", and finally the text unchanged.
        """
        if "```json" in response_text:
            return response_text.split("```json")[1].split("```")[0]
        if "```" in response_text:
            return response_text.split("```")[1].split("```")[0]

        # No fences: tolerate prose before/after a bare JSON object.
        start = response_text.find("{")
        end = response_text.rfind("}")
        if start != -1 and end > start:
            return response_text[start:end + 1]
        return response_text

    @staticmethod
    def _coerce_score(value: Any) -> float | int:
        """Return *value* as a number, or 0 when it is missing or not numeric."""
        if isinstance(value, bool):
            return 0
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.strip())
            except ValueError:
                return 0
        return 0

    def _populate_from_data(self, result: AnalysisResult, data: dict) -> None:
        """Copy patterns, scores and feedback from parsed JSON onto *result*."""
        # Parse patterns; malformed entries are skipped rather than fatal.
        patterns = data.get("patterns")
        if not isinstance(patterns, list):
            patterns = []
        for p in patterns:
            if not isinstance(p, dict):
                continue
            try:
                pattern = Pattern(
                    type=PatternType(p["type"]),
                    severity=Severity(p["severity"]),
                    description=p["description"],
                    evidence=p.get("evidence", []),
                    turn_indices=p.get("turn_indices", []),
                    suggestion=p.get("suggestion", ""),
                    confidence=p.get("confidence", 0.5),
                )
            except (KeyError, ValueError):
                continue
            result.patterns.append(pattern)

        # Parse scores
        scores = data.get("scores")
        if not isinstance(scores, dict):
            scores = {}
        result.reasoning_clarity = self._coerce_score(scores.get("reasoning_clarity"))
        result.goal_adherence = self._coerce_score(scores.get("goal_adherence"))
        result.tool_usage_quality = self._coerce_score(scores.get("tool_usage_quality"))
        result.error_recovery = self._coerce_score(scores.get("error_recovery"))
        result.overall_score = self._coerce_score(scores.get("overall"))

        # Parse feedback ("or []" also covers an explicit JSON null)
        result.strengths = data.get("strengths") or []
        result.weaknesses = data.get("weaknesses") or []
        result.recommendations = data.get("recommendations") or []

    def _parse_analysis_response(
        self,
        response_text: str,
        trace_id: str,
    ) -> AnalysisResult:
        """Parse the JSON analysis response from M2.1.

        Args:
            response_text: Raw text returned by the model
            trace_id: Identifier stored on the returned result

        Returns:
            An AnalysisResult built from the JSON payload, or a fallback
            result (see ``_fallback_parse_analysis``) when the payload is
            missing, invalid, or not a JSON object.
        """
        try:
            data = json.loads(self._extract_json_text(response_text))
        except json.JSONDecodeError as e:
            # If parsing fails, try fallback extraction and set reasonable defaults
            result = self._fallback_parse_analysis(response_text, trace_id, str(e))
        else:
            if isinstance(data, dict):
                result = AnalysisResult(trace_id=trace_id)
                self._populate_from_data(result, data)
            else:
                # Valid JSON but not an object (e.g. a bare list or number).
                result = self._fallback_parse_analysis(
                    response_text,
                    trace_id,
                    f"expected a JSON object, got {type(data).__name__}",
                )

        # Warn if score is suspiciously low (likely parsing failure)
        if result.overall_score == 0 and not result.patterns:
            result.weaknesses.append("WARNING: Analysis may have failed - score is 0 with no patterns detected")
            # Try to extract a score from the response text as fallback
            fallback_score = self._extract_fallback_score(response_text)
            if fallback_score > 0:
                result.overall_score = fallback_score
                result.recommendations.append(f"Score extracted via fallback: {fallback_score}")

        return result

    def _fallback_parse_analysis(
        self,
        response_text: str,
        trace_id: str,
        error_msg: str,
    ) -> AnalysisResult:
        """Fallback parsing when JSON extraction fails.

        Args:
            response_text: Raw text returned by the model
            trace_id: Identifier stored on the returned result
            error_msg: Description of the parse failure, echoed in the
                result's recommendations

        Returns:
            An AnalysisResult whose overall score is scraped from the text
            when possible and otherwise set to a neutral 50.
        """
        result = AnalysisResult(trace_id=trace_id)

        # Try to extract score from text patterns like "Overall Score: 75" or "overall": 75
        for pattern in _FALLBACK_PARSE_SCORE_PATTERNS:
            match = pattern.search(response_text)
            if match:
                result.overall_score = min(100, max(0, int(match.group(1))))
                break

        # If still no score, use a neutral default (not 0)
        if result.overall_score == 0:
            result.overall_score = 50  # Neutral default instead of 0

        result.recommendations = [
            f"Analysis parsing failed ({error_msg}). Using fallback extraction.",
            "Consider re-running analysis if results seem inconsistent.",
        ]
        result.weaknesses = ["JSON parsing failed - analysis may be incomplete"]

        return result

    def _extract_fallback_score(self, response_text: str) -> float:
        """Extract a score from response text when JSON parsing fails.

        Returns:
            The first in-range (0-100) score matched by any known pattern,
            or 0.0 when none is found.
        """
        for pattern in _FALLBACK_EXTRACT_SCORE_PATTERNS:
            match = pattern.search(response_text)
            if match:
                score = int(match.group(1))
                if 0 <= score <= 100:
                    return float(score)

        return 0.0


def format_analysis_report(analysis: AnalysisResult) -> str:
    """Format an analysis result as a human-readable report.

    Args:
        analysis: The analysis result to render

    Returns:
        A multi-line string with the scores followed by any detected
        patterns, strengths, weaknesses, and numbered recommendations.
    """
    lines = [
        "=" * 60,
        "REASONING TRACE ANALYSIS REPORT",
        "=" * 60,
        "",
        f"Overall Score: {analysis.overall_score}/100",
        "",
        "Scores:",
        f" - Reasoning Clarity: {analysis.reasoning_clarity}/100",
        f" - Goal Adherence: {analysis.goal_adherence}/100",
        f" - Tool Usage Quality: {analysis.tool_usage_quality}/100",
        f" - Error Recovery: {analysis.error_recovery}/100",
        "",
    ]

    if analysis.patterns:
        lines.append("Detected Patterns:")
        for p in analysis.patterns:
            lines.append(f"\n [{p.severity.value.upper()}] {p.type.value}")
            lines.append(f" {p.description}")
            lines.append(f" Suggestion: {p.suggestion}")

    if analysis.strengths:
        lines.append("\nStrengths:")
        for s in analysis.strengths:
            lines.append(f" + {s}")

    if analysis.weaknesses:
        lines.append("\nWeaknesses:")
        for w in analysis.weaknesses:
            lines.append(f" - {w}")

    if analysis.recommendations:
        lines.append("\nRecommendations:")
        for i, r in enumerate(analysis.recommendations, 1):
            lines.append(f" {i}. {r}")

    return "\n".join(lines)