Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/interleaved-thinking/examples/03_full_optimization.py
1"""2Example 3: Full Optimization Loop with Comprehensive Tools34Demonstrates the complete optimization cycle with realistic tools:5- Web search for finding information6- URL reading for fetching content7- File system operations (read, write, list)8- Note-taking for tracking findings910This example uses REAL URLs and realistic content to demonstrate11how the Reasoning Trace Optimizer works in production scenarios.12"""1314import json15import os16import random17from datetime import datetime18from pathlib import Path1920from dotenv import load_dotenv2122from reasoning_trace_optimizer import (23OptimizationLoop,24LoopConfig,25SkillGenerator,26)2728# Load environment variables from the project root29env_path = Path(__file__).parent.parent / ".env"30load_dotenv(env_path)313233# =============================================================================34# COMPREHENSIVE TOOL DEFINITIONS35# =============================================================================3637TOOLS = [38# Web Search Tool39{40"name": "web_search",41"description": "Search the web for information. Returns a list of results with titles, URLs, and snippets. Use specific queries for better results.",42"input_schema": {43"type": "object",44"properties": {45"query": {46"type": "string",47"description": "Search query - be specific and use relevant keywords",48},49"num_results": {50"type": "integer",51"description": "Number of results to return (1-10, default 5)",52"default": 5,53},54},55"required": ["query"],56},57},58# Read URL Tool59{60"name": "read_url",61"description": "Fetch and read the content of a webpage. Returns the main text content. Use after web_search to get full details from a result.",62"input_schema": {63"type": "object",64"properties": {65"url": {66"type": "string",67"description": "The URL to fetch content from",68},69},70"required": ["url"],71},72},73# File Read Tool74{75"name": "read_file",76"description": "Read the contents of a local file. 
Supports text files, markdown, JSON, etc.",77"input_schema": {78"type": "object",79"properties": {80"path": {81"type": "string",82"description": "Path to the file to read",83},84},85"required": ["path"],86},87},88# File Write Tool89{90"name": "write_file",91"description": "Write content to a local file. Creates the file if it doesn't exist, overwrites if it does.",92"input_schema": {93"type": "object",94"properties": {95"path": {96"type": "string",97"description": "Path where to write the file",98},99"content": {100"type": "string",101"description": "Content to write to the file",102},103},104"required": ["path", "content"],105},106},107# List Directory Tool108{109"name": "list_directory",110"description": "List files and folders in a directory. Useful for exploring project structure.",111"input_schema": {112"type": "object",113"properties": {114"path": {115"type": "string",116"description": "Directory path to list (default: current directory)",117"default": ".",118},119},120"required": [],121},122},123# Save Note Tool124{125"name": "save_note",126"description": "Save a research note with title and content. Use to track important findings during research.",127"input_schema": {128"type": "object",129"properties": {130"title": {131"type": "string",132"description": "Title of the note",133},134"content": {135"type": "string",136"description": "Content of the note",137},138"tags": {139"type": "array",140"items": {"type": "string"},141"description": "Optional tags for categorization",142},143},144"required": ["title", "content"],145},146},147# Calculator Tool148{149"name": "calculator",150"description": "Perform mathematical calculations. 
Supports basic arithmetic and common functions.",151"input_schema": {152"type": "object",153"properties": {154"expression": {155"type": "string",156"description": "Mathematical expression to evaluate (e.g., '2 + 2', 'sqrt(16)', '100 * 0.15')",157},158},159"required": ["expression"],160},161},162]163164165# =============================================================================166# REAL-WORLD SIMULATED DATA167# Based on actual documentation and research from AI companies168# =============================================================================169170# Simulated web search results with REAL URLs171SEARCH_DATABASE = {172"context engineering ai": [173{174"title": "Context Engineering for AI Agents - Anthropic",175"url": "https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",176"snippet": "Prompt caching is a feature that optimizes API usage by allowing resuming from specific prefixes in your prompts. Cache the context you want to reuse across requests.",177},178{179"title": "Building Effective AI Agents - Anthropic Research",180"url": "https://www.anthropic.com/research/building-effective-agents",181"snippet": "A comprehensive guide to building effective AI agents. Covers tool use, context management, error handling, and best practices for production deployments.",182},183{184"title": "Large Language Models and Context Windows - OpenAI",185"url": "https://platform.openai.com/docs/guides/text-generation",186"snippet": "Understanding how context windows work in large language models. 
Learn about token limits, context management strategies, and optimizing for performance.",187},188],189"interleaved thinking agents": [190{191"title": "MiniMax M2.1 - Interleaved Thinking Model",192"url": "https://www.minimax.io/platform/docs/M2.1",193"snippet": "M2.1 introduces interleaved thinking - the ability for models to reason between tool calls, enabling better debugging and adaptability in agentic workflows.",194},195{196"title": "Chain of Thought Prompting - Google Research",197"url": "https://arxiv.org/abs/2201.11903",198"snippet": "Chain-of-thought prompting enables complex reasoning in large language models. This paper explores how step-by-step reasoning improves model performance.",199},200],201"prompt optimization techniques": [202{203"title": "Prompt Engineering Guide - DAIR.AI",204"url": "https://www.promptingguide.ai/techniques",205"snippet": "Comprehensive guide to prompt engineering techniques including zero-shot, few-shot, chain-of-thought, and advanced methods for optimizing LLM outputs.",206},207{208"title": "Best Practices for Prompt Engineering - OpenAI",209"url": "https://platform.openai.com/docs/guides/prompt-engineering",210"snippet": "Official OpenAI guide on prompt engineering best practices. Covers strategies for getting better results, handling edge cases, and iterative refinement.",211},212],213"agent debugging best practices": [214{215"title": "Debugging AI Agents - LangChain Documentation",216"url": "https://python.langchain.com/docs/how_to/debugging",217"snippet": "Learn how to debug LangChain agents effectively. Covers tracing, verbose mode, callbacks, and common debugging patterns for complex agent workflows.",218},219{220"title": "LLM Observability and Tracing - Weights & Biases",221"url": "https://docs.wandb.ai/guides/prompts",222"snippet": "Track and debug LLM applications with W&B Prompts. 
Visualize chains, compare outputs, and identify failure patterns in your AI applications.",223},224],225"context window optimization": [226{227"title": "Claude's Context Window - Anthropic Documentation",228"url": "https://docs.anthropic.com/en/docs/build-with-claude/context-windows",229"snippet": "Claude supports context windows up to 200K tokens. Learn how to effectively use large context windows and optimize token usage for cost and performance.",230},231{232"title": "Lost in the Middle: How Language Models Use Long Contexts",233"url": "https://arxiv.org/abs/2307.03172",234"snippet": "Research on how LLMs utilize information across long contexts. Models perform worse when relevant info is in the middle vs. beginning/end of context.",235},236],237}238239# Simulated webpage content based on REAL documentation240PAGE_CONTENT = {241"https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching": """242# Prompt Caching - Anthropic Documentation243244Prompt caching is a feature that optimizes API usage by allowing you to cache frequently used context.245246## Overview247248Prompt caching allows you to cache the system prompt, examples, and other static content that remains constant across multiple requests. This:249250- **Reduces latency** by up to 85% for cached content251- **Lowers costs** by avoiding re-processing of identical context252- **Improves throughput** for high-volume applications253254## How It Works255256When you enable prompt caching, the API stores a hash of your prompt prefix. On subsequent requests with the same prefix, the cached computation is reused.257258### Cache Breakpoints259260You can specify cache breakpoints using the `cache_control` parameter:261262```python263messages = [264{265"role": "user",266"content": [267{268"type": "text",269"text": "Your static context here...",270"cache_control": {"type": "ephemeral"}271}272]273}274]275```276277## Best Practices2782791. 
**Cache stable content**: Put instructions and examples that don't change in the cached portion2802. **Place dynamic content last**: User queries and variable data should come after cached content2813. **Monitor cache hits**: Use the response headers to track cache efficiency2824. **Minimum cache size**: Content must be at least 1024 tokens to be cached283284## Context Engineering Implications285286Effective prompt caching is a key part of context engineering. By understanding what to cache:287288- System prompts with role definitions289- Tool descriptions that remain constant290- Few-shot examples for consistent behavior291- Reference documentation the model needs292293You reduce both latency and cost while maintaining quality.294""",295"https://www.anthropic.com/research/building-effective-agents": """296# Building Effective AI Agents - Anthropic Research297298This guide covers best practices for building reliable, effective AI agents using Claude.299300## Core Principles301302### 1. Start Simple, Add Complexity Gradually303304Begin with the simplest possible agent architecture:305- Single tool with clear purpose306- Linear workflow without branching307- Explicit success criteria308309Only add complexity when you have evidence it's needed.310311### 2. Tool Design Matters312313Well-designed tools make agents more reliable:314315- **Clear descriptions**: Explain what the tool does AND when to use it316- **Typed inputs**: Use JSON Schema to define expected parameters317- **Informative outputs**: Return data the model can interpret and act on318- **Error messages**: Provide actionable guidance when things fail319320### 3. Context Management321322Context is your most precious resource:323324- **Token efficiency**: Every token costs money and attention325- **Structured format**: Use consistent formatting for easier parsing326- **Progressive disclosure**: Load information on-demand327- **Summarization**: Compress long histories while preserving key facts328329### 4. 
Error Handling330331Agents will encounter errors. Design for recovery:332333- Give the model explicit permission to retry334- Provide diagnostic information in error messages335- Set clear stopping conditions to prevent infinite loops336- Log everything for debugging337338## Common Anti-Patterns3393401. **Over-engineering**: Building complex multi-agent systems before validating single-agent performance3412. **Vague tools**: Tool descriptions that don't clarify when to use each tool3423. **Context overload**: Stuffing too much information into the prompt3434. **No exit conditions**: Letting agents run indefinitely without progress checks344345## Debugging Strategies346347### Trace Analysis348349The key to debugging agents is understanding their reasoning:3503511. Capture the full reasoning trace including thinking blocks3522. Identify where the agent's understanding diverged from reality3533. Look for patterns: tool confusion, goal drift, context loss3544. Iterate on prompts based on specific failure modes355356### Interleaved Thinking357358Models with interleaved thinking (reasoning between tool calls) provide better debugging insight because you can see:359360- How they interpreted each tool result361- What alternatives they considered362- When and why they changed approach363""",364"https://platform.openai.com/docs/guides/text-generation": """365# Text Generation - OpenAI Documentation366367Learn how to generate text with OpenAI's models.368369## Context Windows370371Each model has a context window that determines the maximum number of tokens it can process:372373| Model | Context Window |374|-------|----------------|375| GPT-4o | 128K tokens |376| GPT-4 Turbo | 128K tokens |377| GPT-3.5 Turbo | 16K tokens |378379### Managing Context380381For long conversations or documents:3823831. **Truncation**: Remove oldest messages when approaching the limit3842. **Summarization**: Replace old messages with summaries3853. 
**Retrieval**: Use RAG to fetch only relevant content386387### Token Counting388389Use the tiktoken library to count tokens before sending requests:390391```python392import tiktoken393394encoding = tiktoken.encoding_for_model("gpt-4")395num_tokens = len(encoding.encode("Your text here"))396```397398## Best Practices399400### Structured Prompts401402Organize your prompts with clear sections:403- System message: Role and general instructions404- Context: Background information needed405- Task: Specific request with format requirements406- Examples: Few-shot demonstrations if helpful407408### Temperature and Sampling409410- **temperature=0**: Deterministic, best for factual tasks411- **temperature=0.7**: Balanced creativity and coherence412- **temperature=1.0+**: More random, for creative tasks413""",414"https://www.minimax.io/platform/docs/M2.1": """415# MiniMax M2.1 - Interleaved Thinking Model416417M2.1 is a next-generation reasoning model that introduces **interleaved thinking** - continuous reasoning throughout task execution.418419## What is Interleaved Thinking?420421Traditional reasoning models think once at the start, then execute:422```423Think → Act → Act → Act → Done424```425426M2.1 thinks between every action:427```428Think → Act → Think → Act → Think → Act → Done429```430431## Why This Matters432433### 1. Better Debugging434435The thinking blocks expose the model's reasoning process. You can see:436- What it understood from tool results437- How it decided what to do next438- Where it might have gone wrong439440### 2. Adaptive Behavior441442By reasoning after each tool call, M2.1 can:443- React to unexpected outputs444- Recover from errors mid-execution445- Adjust strategy based on new information446447### 3. Long-Horizon Tasks448449For complex multi-step tasks, maintaining focus is crucial. 
Interleaved thinking:450- Reinforces the original goal451- Tracks progress toward completion452- Identifies when the task is done453454## API Usage455456### Anthropic SDK457458```python459import anthropic460461client = anthropic.Anthropic(462api_key="your-key",463base_url="https://api.minimax.io/anthropic"464)465466response = client.messages.create(467model="MiniMax-M2.1",468max_tokens=4096,469messages=[{"role": "user", "content": "Your task"}]470)471472# Access thinking blocks473for block in response.content:474if block.type == "thinking":475print(f"Thinking: {block.thinking}")476elif block.type == "text":477print(f"Response: {block.text}")478```479480## Best Practices4814821. **Preserve full context**: Always include thinking blocks in message history4832. **Clear tool descriptions**: Help the model understand when to use each tool4843. **Explicit success criteria**: Define what "done" looks like4854. **Error guidance**: Give clear instructions for handling failures486""",487"https://www.promptingguide.ai/techniques": """488# Prompt Engineering Techniques - DAIR.AI489490A comprehensive guide to prompt engineering techniques for large language models.491492## Basic Techniques493494### Zero-Shot Prompting495496Ask the model to perform a task without examples:497498```499Classify this text as positive, negative, or neutral:500"I really enjoyed the movie but the ending was disappointing."501```502503### Few-Shot Prompting504505Provide examples to guide the model:506507```508Classify sentiment:509"Great product!" → Positive510"Terrible service." → Negative511"It was okay." → Neutral512"I really enjoyed the movie but the ending was disappointing." →513```514515## Advanced Techniques516517### Chain-of-Thought (CoT)518519Encourage step-by-step reasoning:520521```522Solve this problem step by step:523If John has 5 apples and gives 2 to Mary, then buys 3 more, how many does he have?524525Let's think through this:5261. John starts with 5 apples5272. 
He gives 2 to Mary: 5 - 2 = 3 apples5283. He buys 3 more: 3 + 3 = 6 apples529Answer: 6 apples530```531532### Self-Consistency533534Generate multiple reasoning paths and take the majority answer. Improves reliability for complex reasoning tasks.535536### Tree of Thoughts537538Explore multiple reasoning branches simultaneously, evaluating and pruning paths to find optimal solutions.539540## Prompt Optimization541542### Iterative Refinement5435441. Start with a basic prompt5452. Test on representative examples5463. Analyze failures5474. Refine prompt based on patterns5485. Repeat until convergence549550### Common Failure Patterns551552| Pattern | Solution |553|---------|----------|554| Goal drift | Add explicit goal reminders |555| Hallucination | Require source citations |556| Incomplete output | Specify format requirements |557| Wrong tool usage | Improve tool descriptions |558""",559"https://platform.openai.com/docs/guides/prompt-engineering": """560# Prompt Engineering Best Practices - OpenAI561562Official guide to getting better results from large language models.563564## Six Strategies565566### 1. Write Clear Instructions567568Be specific about what you want:569- Include details in your query570- Ask the model to adopt a persona571- Use delimiters to mark distinct sections572- Specify desired output format and length573574### 2. Provide Reference Text575576Reduce hallucinations:577- Instruct the model to answer using provided text578- Ask for citations from the source material579- Use retrieval to inject relevant context580581### 3. Split Complex Tasks582583Break down hard problems:584- Use intent classification to route queries585- Summarize long documents in chunks586- Break multi-step tasks into sequential prompts587588### 4. Give the Model Time to Think589590Improve reasoning:591- Ask for a chain of reasoning592- Use inner monologue to hide intermediate steps593- Ask if previous steps were correct594595### 5. 
Use External Tools596597Augment model capabilities:598- Use code execution for accurate calculations599- Use retrieval for up-to-date information600- Use APIs for specific functionality601602### 6. Test Changes Systematically603604Evaluate prompt effectiveness:605- Define comprehensive test cases606- Measure against gold-standard answers607- Track metrics over prompt iterations608609## Anti-Patterns to Avoid6106111. **Ambiguous instructions**: "Make it better" vs "Improve clarity by adding examples"6122. **Too much context**: Relevant info gets lost in noise6133. **No output format**: Model guesses what you want6144. **Assuming knowledge**: Model doesn't know your codebase/domain615""",616"https://python.langchain.com/docs/how_to/debugging": """617# Debugging LangChain Agents618619Learn effective debugging strategies for LangChain applications.620621## Verbose Mode622623Enable detailed logging:624625```python626from langchain.globals import set_verbose627628set_verbose(True)629```630631This prints:632- Each step in the chain633- Inputs and outputs at every stage634- Tool calls and their results635636## LangSmith Tracing637638For production debugging, use LangSmith:639640```python641import os642os.environ["LANGCHAIN_TRACING_V2"] = "true"643os.environ["LANGCHAIN_API_KEY"] = "your-key"644```645646LangSmith provides:647- Visual trace of every step648- Latency breakdown649- Token usage tracking650- Failure analysis651652## Common Debugging Patterns653654### 1. Tool Selection Issues655656The agent picks the wrong tool. Debug by:657- Checking tool descriptions for clarity658- Reviewing the prompt format659- Testing with simplified tool sets660661### 2. Infinite Loops662663Agent repeats the same action. Fix by:664- Adding max_iterations limit665- Including progress checks in prompts666- Implementing early stopping conditions667668### 3. Context Loss669670Agent forgets earlier information. 
Address by:671- Checking context window limits672- Implementing conversation summarization673- Using retrieval for long-term memory674675### 4. Hallucination676677Agent makes up information. Reduce by:678- Requiring citations679- Validating outputs against sources680- Using temperature=0 for factual tasks681682## Trace Analysis683684The most powerful debugging technique is analyzing the full trace:6856861. Capture all inputs, outputs, and reasoning6872. Find the exact step where things went wrong6883. Identify the pattern (tool confusion, goal drift, etc.)6894. Update prompts to address the specific failure690""",691"https://arxiv.org/abs/2307.03172": """692# Lost in the Middle: How Language Models Use Long Contexts693694Liu et al., 2023695696## Abstract697698While large language models support increasingly long context windows, we find they struggle to effectively use information in the middle of long contexts. This "lost in the middle" phenomenon has important implications for RAG systems and context engineering.699700## Key Findings701702### 1. U-Shaped Performance Curve703704When relevant information is placed at different positions in a long context:705- **Beginning**: High performance (recency effect)706- **Middle**: Significantly degraded performance707- **End**: High performance (primacy effect)708709### 2. Performance Degrades with Context Length710711Even when information is at optimal positions, performance decreases as total context length increases.712713### 3. Model Size Doesn't Fix It714715Larger models show the same pattern. This is a fundamental limitation of current architectures.716717## Implications for Practitioners718719### Context Engineering Strategies7207211. **Place critical information at the start or end**722- Instructions at the beginning723- Task-specific context at the end7247252. **Keep context focused**726- Only include truly relevant information727- Remove redundant or low-signal content7287293. 
**Structure for attention**730- Use clear section headers731- Separate distinct topics732- Front-load important details in each section733734### RAG System Design7357361. **Limit retrieved chunks**737- Quality over quantity738- Rank by relevance, not just similarity7397402. **Position retrieved content strategically**741- Most relevant chunks at boundaries742- Less relevant in middle if needed7437443. **Consider summarization**745- Condense multiple sources746- Preserve key information density747""",748}749750# Simulated file system with realistic project structure751FILE_SYSTEM = {752"./project/README.md": """# AI Agent Research Project753754This project explores context engineering and agent optimization techniques.755756## Structure757- research/ - Research notes and findings758- output/ - Generated reports and summaries759- data/ - Source materials and datasets760761## Current Focus7621. Understanding context engineering principles7632. Exploring interleaved thinking for debugging7643. Developing prompt optimization strategies765766## Resources767- Anthropic Documentation: https://docs.anthropic.com768- OpenAI Guides: https://platform.openai.com/docs769- MiniMax M2.1: https://www.minimax.io770""",771"./project/research/notes.md": """# Research Notes772773## Context Engineering774775### Definition776Context engineering is the discipline of managing what information enters the AI model's context window. It goes beyond prompt engineering to consider:777- System prompts and instructions778- Tool definitions and descriptions779- Retrieved documents (RAG)780- Conversation history781- Tool outputs and intermediate results782783### Key Insight: "Lost in the Middle"784Research shows LLMs struggle with information in the middle of long contexts. Place important information at the start or end.785786### Best Practices7871. Quality over quantity - only include high-signal tokens7882. Structure matters - use clear formatting and hierarchies7893. 
# Runtime state: filled in by execute_tool() while the agent works.
saved_notes = []
written_files = {}


# =============================================================================
# TOOL EXECUTOR
# =============================================================================

def execute_tool(name: str, input_data: dict) -> str:
    """Execute a simulated tool and return its result as a JSON string.

    Nothing here touches the network or the real file system: searches are
    answered from SEARCH_DATABASE, pages from PAGE_CONTENT, file reads from
    FILE_SYSTEM (falling back to files written earlier in the run), and
    writes/notes are kept in the module-level ``written_files`` and
    ``saved_notes`` containers.

    Args:
        name: Tool name, one of the entries declared in TOOLS.
        input_data: Arguments for the tool, shaped like that tool's
            ``input_schema``. Missing optional keys fall back to defaults.

    Returns:
        A JSON-encoded object. Failures are reported inside the payload
        (``"status": "error"`` or an ``"error"`` key) rather than raised, so
        the calling agent can read the message and recover.
    """
    if name == "web_search":
        query = input_data.get("query", "").lower()

        # The schema promises 1-10 results. Clamp both ends and tolerate
        # null / non-numeric values: an unclamped 0 or negative number would
        # slice from the wrong end of the result list.
        try:
            num_results = int(input_data.get("num_results", 5))
        except (TypeError, ValueError, OverflowError):
            num_results = 5
        num_results = max(1, min(num_results, 10))

        # A database entry matches when it shares at least one word with the
        # query. The query word set is loop-invariant, so build it once.
        query_words = set(query.split())
        results = []
        for key, items in SEARCH_DATABASE.items():
            if query_words & set(key.split()):
                results.extend(items)

        # Deduplicate by URL while preserving first-seen order.
        seen_urls = set()
        unique_results = []
        for r in results:
            if r["url"] not in seen_urls:
                seen_urls.add(r["url"])
                unique_results.append(r)

        if not unique_results:
            # Return generic "no results" response
            return json.dumps({
                "query": query,
                "num_results": 0,
                "results": [],
                "message": "No results found. Try different keywords.",
            })

        limited = unique_results[:num_results]
        return json.dumps({
            "query": query,
            "num_results": len(limited),
            "results": limited,
        })

    elif name == "read_url":
        url = input_data.get("url", "")
        content = PAGE_CONTENT.get(url)

        if content:
            return json.dumps({
                "url": url,
                "status": "success",
                "content": content,
                "length": len(content),
            })
        else:
            return json.dumps({
                "url": url,
                "status": "error",
                "error": "Page not found or unable to fetch content",
            })

    elif name == "read_file":
        path = input_data.get("path", "")

        # Check mock file system first
        if path in FILE_SYSTEM:
            return json.dumps({
                "path": path,
                "status": "success",
                "content": FILE_SYSTEM[path],
            })

        # Then files the agent wrote earlier in this run
        if path in written_files:
            return json.dumps({
                "path": path,
                "status": "success",
                "content": written_files[path],
            })

        return json.dumps({
            "path": path,
            "status": "error",
            "error": f"File not found: {path}",
        })

    elif name == "write_file":
        path = input_data.get("path", "")
        content = input_data.get("content", "")

        # Stored in memory only; an existing entry is overwritten.
        written_files[path] = content
        return json.dumps({
            "path": path,
            "status": "success",
            "message": f"Successfully wrote {len(content)} characters to {path}",
        })

    elif name == "list_directory":
        path = input_data.get("path", ".")

        # Hard-coded listings for the known directories of the mock project.
        if path == "." or path == "./project":
            return json.dumps({
                "path": path,
                "entries": [
                    {"name": "README.md", "type": "file"},
                    {"name": "research", "type": "directory"},
                    {"name": "output", "type": "directory"},
                    {"name": "data", "type": "directory"},
                ],
            })
        elif path == "./project/research" or path == "research":
            return json.dumps({
                "path": path,
                "entries": [
                    {"name": "notes.md", "type": "file"},
                    {"name": "references.md", "type": "file"},
                ],
            })
        else:
            return json.dumps({
                "path": path,
                "entries": [],
                "message": "Directory is empty or does not exist",
            })

    elif name == "save_note":
        note = {
            "id": len(saved_notes) + 1,  # 1-based, in order of creation
            "title": input_data.get("title", "Untitled"),
            "content": input_data.get("content", ""),
            "tags": input_data.get("tags", []),
            "timestamp": datetime.now().isoformat(),
        }
        saved_notes.append(note)
        return json.dumps({
            "status": "success",
            "note_id": note["id"],
            "message": f"Note '{note['title']}' saved successfully",
        })

    elif name == "calculator":
        import math

        expression = input_data.get("expression", "")
        try:
            allowed_names = {
                "sqrt": math.sqrt,
                "sin": math.sin,
                "cos": math.cos,
                "tan": math.tan,
                "log": math.log,
                "log10": math.log10,
                "exp": math.exp,
                "pow": pow,
                "abs": abs,
                "round": round,
                "pi": math.pi,
                "e": math.e,
            }
            # SECURITY NOTE: emptying __builtins__ narrows what an expression
            # can reach, but eval() is not a real sandbox (attribute access
            # on literals can still reach arbitrary objects, and expressions
            # such as huge exponents can hang). Acceptable for this local
            # demo; use an AST-based evaluator before exposing this to
            # untrusted input.
            result = eval(expression, {"__builtins__": {}}, allowed_names)

            # json.dumps would emit the non-standard tokens Infinity / NaN
            # for these, so report them as a calculation error instead.
            if isinstance(result, float) and not math.isfinite(result):
                raise ValueError("Result is not a finite number")

            return json.dumps({
                "expression": expression,
                "result": result,
                "status": "success",
            })
        except Exception as e:
            # Any failure (syntax error, unknown name, math error, result
            # that cannot be JSON-encoded) becomes an error payload.
            return json.dumps({
                "expression": expression,
                "status": "error",
                "error": str(e),
            })

    return json.dumps({"error": f"Unknown tool: {name}"})
comprehensive tools."""1017global saved_notes, written_files10181019# Reset state1020saved_notes = []1021written_files = {}10221023# Configuration for optimization1024# Note: Complex research tasks typically plateau around 65-75 scores1025# due to inherent variability in multi-tool reasoning chains1026config = LoopConfig(1027max_iterations=5, # Usually converges within 3-5 iterations1028convergence_threshold=3.0, # Stop when improvements become marginal1029min_score_threshold=75.0, # Realistic target for complex research tasks1030regression_threshold=8.0, # Detect significant score drops1031use_best_prompt=True, # Always use the best-performing prompt1032max_prompt_growth=5.0, # Prevent excessive prompt bloat1033save_artifacts=True,1034artifacts_dir="./optimization_artifacts",1035verbose=True,1036)10371038# Initialize the optimization loop1039loop = OptimizationLoop(1040config=config,1041api_key=os.getenv("ANTHROPIC_API_KEY"),1042base_url="https://api.minimax.io/anthropic",1043model="MiniMax-M2.1",1044)10451046# Complex research task requiring multiple tools1047task = """Research the topic of "context engineering for AI agents" and create a comprehensive summary.10481049Your research should:10501. Search for information about context engineering concepts and best practices10512. Read relevant sources to gather detailed information10523. Check the local project files for any existing research notes10534. Save important findings as notes for future reference10545. Write a final summary report to ./output/research_summary.md10551056The summary should include:1057- Key concepts and definitions1058- Best practices and techniques (including the "lost in the middle" problem)1059- Practical recommendations for agent developers1060- References to sources consulted (use actual URLs from your research)"""10611062# Intentionally weak initial prompt to show optimization improvement1063initial_prompt = """You are a research assistant. 
Help with research tasks using the available tools."""10641065print("=" * 70)1066print("COMPREHENSIVE OPTIMIZATION LOOP DEMONSTRATION")1067print("=" * 70)1068print(f"\nTask:\n{task}")1069print(f"\nInitial (weak) prompt:\n{initial_prompt}")1070print(f"\nTools available: {', '.join(t['name'] for t in TOOLS)}")1071print("\n" + "=" * 70)1072print("Starting optimization loop...")1073print("=" * 70)10741075# Run the optimization loop1076result = loop.run(1077task=task,1078initial_prompt=initial_prompt,1079tools=TOOLS,1080tool_executor=execute_tool,1081)10821083# Show results1084print("\n" + "=" * 70)1085print("OPTIMIZATION RESULTS")1086print("=" * 70)10871088print(f"\nTotal Iterations: {result.total_iterations}")1089print(f"Converged: {result.converged}")1090print(f"Score Improvement: {result.initial_score:.1f} → {result.final_score:.1f} ({result.improvement_percentage:+.1f}%)")10911092print("\n" + "=" * 70)1093print("ITERATION DETAILS")1094print("=" * 70)10951096for iteration in result.iterations:1097print(f"\n{'─' * 50}")1098print(f"ITERATION {iteration.iteration}")1099print(f"{'─' * 50}")1100print(f"Task Completed: {iteration.task_completed}")1101print(f"Score: {iteration.analysis.overall_score:.1f}/100")1102print(f"Patterns Found: {len(iteration.analysis.patterns)}")1103print(f"Tool Calls Made: {len(iteration.trace.tool_calls)}")1104print(f"Thinking Blocks: {len(iteration.trace.thinking_blocks)}")11051106if iteration.analysis.patterns:1107print("\nDetected Patterns:")1108for p in iteration.analysis.patterns:1109print(f" [{p.severity.value.upper()}] {p.type.value}")1110print(f" {p.description[:80]}...")1111print(f" Suggestion: {p.suggestion[:80]}...")11121113if iteration.analysis.strengths:1114print("\nStrengths:")1115for s in iteration.analysis.strengths[:3]:1116print(f" + {s[:80]}...")11171118if iteration.analysis.weaknesses:1119print("\nWeaknesses:")1120for w in iteration.analysis.weaknesses[:3]:1121print(f" - {w[:80]}...")11221123if iteration.optimization and 
iteration.optimization.key_changes:1124print("\nKey Changes Applied:")1125for change in iteration.optimization.key_changes[:3]:1126print(f" • {change[:80]}...")11271128print("\n" + "=" * 70)1129print("FINAL OPTIMIZED PROMPT")1130print("=" * 70)1131print(result.final_prompt)11321133# Show tool usage summary1134print("\n" + "=" * 70)1135print("TOOL USAGE ACROSS ALL ITERATIONS")1136print("=" * 70)11371138tool_usage = {}1139for iteration in result.iterations:1140for tc in iteration.trace.tool_calls:1141tool_usage[tc.name] = tool_usage.get(tc.name, 0) + 111421143for tool_name, count in sorted(tool_usage.items(), key=lambda x: -x[1]):1144print(f" {tool_name}: {count} calls")11451146# Show saved notes1147if saved_notes:1148print("\n" + "=" * 70)1149print("NOTES SAVED DURING RESEARCH")1150print("=" * 70)1151for note in saved_notes:1152print(f"\n[{note['id']}] {note['title']}")1153if note['tags']:1154print(f" Tags: {', '.join(note['tags'])}")1155print(f" {note['content'][:150]}...")11561157# Show written files1158if written_files:1159print("\n" + "=" * 70)1160print("FILES WRITTEN DURING RESEARCH")1161print("=" * 70)1162for path, content in written_files.items():1163print(f"\n{path} ({len(content)} chars)")1164print(f" Preview: {content[:200]}...")11651166# Generate a shareable skill1167print("\n" + "=" * 70)1168print("GENERATING SHAREABLE SKILL")1169print("=" * 70)11701171generator = SkillGenerator(1172api_key=os.getenv("ANTHROPIC_API_KEY"),1173base_url="https://api.minimax.io/anthropic",1174model="MiniMax-M2.1",1175)11761177skill_path = generator.generate(1178result=result,1179skill_name="comprehensive-research-agent",1180output_dir="./generated_skills",1181title="Comprehensive Research Agent Best Practices",1182)11831184print(f"\nGenerated skill at: {skill_path}")1185print("\nThis skill captures the learnings from optimization and can be shared")1186print("with other developers to improve their research agents!")11871188# Final summary1189print("\n" + "=" * 
70)1190print("SUMMARY")1191print("=" * 70)1192print(f"""1193The optimization loop demonstrated:119411951. INTERLEAVED THINKING1196- {sum(len(i.trace.thinking_blocks) for i in result.iterations)} thinking blocks captured across {result.total_iterations} iterations1197- Full visibility into agent reasoning between tool calls119811992. PATTERN DETECTION1200- Identified patterns: {', '.join(set(p.type.value for i in result.iterations for p in i.analysis.patterns)) or 'None'}1201- Each pattern includes evidence and suggestions120212033. PROMPT OPTIMIZATION1204- Initial score: {result.initial_score:.1f}1205- Final score: {result.final_score:.1f}1206- Improvement: {result.improvement_percentage:+.1f}%120712084. SKILL GENERATION1209- Created shareable skill at: {skill_path}1210- Captures learnings for other developers121112125. REAL-WORLD URLS USED1213- Anthropic: docs.anthropic.com1214- OpenAI: platform.openai.com1215- MiniMax: minimax.io1216- DAIR.AI: promptingguide.ai1217- Research papers: arxiv.org1218""")121912201221if __name__ == "__main__":1222main()1223