Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
241
Skill
n/a
Size
2.6 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
examples/interleaved-thinking/examples/03_full_optimization.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
Code · 1223 lines · Free
examples/interleaved-thinking/examples/03_full_optimization.py
1"""
2Example 3: Full Optimization Loop with Comprehensive Tools
3 
4Demonstrates the complete optimization cycle with realistic tools:
5- Web search for finding information
6- URL reading for fetching content
7- File system operations (read, write, list)
8- Note-taking for tracking findings
9 
10This example uses REAL URLs and realistic content to demonstrate
11how the Reasoning Trace Optimizer works in production scenarios.
12"""
13 
14import json
15import os
16import random
17from datetime import datetime
18from pathlib import Path
19 
20from dotenv import load_dotenv
21 
22from reasoning_trace_optimizer import (
23    OptimizationLoop,
24    LoopConfig,
25    SkillGenerator,
26)
27 
# Load environment variables from the project root: the `.env` file located
# one directory above the folder that contains this script.
env_path = Path(__file__).parent.parent / ".env"
load_dotenv(env_path)
31 
32 
33# =============================================================================
34# COMPREHENSIVE TOOL DEFINITIONS
35# =============================================================================
36 
# Tool definitions exposed to the model. Each entry has a "name", a
# human-readable "description", and an "input_schema" written as a JSON Schema
# object describing the arguments the tool accepts.
TOOLS = [
    # Web Search Tool
    {
        "name": "web_search",
        "description": "Search the web for information. Returns a list of results with titles, URLs, and snippets. Use specific queries for better results.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query - be specific and use relevant keywords",
                },
                "num_results": {
                    "type": "integer",
                    "description": "Number of results to return (1-10, default 5)",
                    "default": 5,
                },
            },
            "required": ["query"],
        },
    },
    # Read URL Tool
    {
        "name": "read_url",
        "description": "Fetch and read the content of a webpage. Returns the main text content. Use after web_search to get full details from a result.",
        "input_schema": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch content from",
                },
            },
            "required": ["url"],
        },
    },
    # File Read Tool
    {
        "name": "read_file",
        "description": "Read the contents of a local file. Supports text files, markdown, JSON, etc.",
        "input_schema": {
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Path to the file to read",
                },
            },
            "required": ["path"],
        },
    },
    # File Write Tool
    {
        "name": "write_file",
        "description": "Write content to a local file. Creates the file if it doesn't exist, overwrites if it does.",
        "input_schema": {
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Path where to write the file",
                },
                "content": {
                    "type": "string",
                    "description": "Content to write to the file",
                },
            },
            "required": ["path", "content"],
        },
    },
    # List Directory Tool
    {
        "name": "list_directory",
        "description": "List files and folders in a directory. Useful for exploring project structure.",
        "input_schema": {
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Directory path to list (default: current directory)",
                    "default": ".",
                },
            },
            "required": [],
        },
    },
    # Save Note Tool
    {
        "name": "save_note",
        "description": "Save a research note with title and content. Use to track important findings during research.",
        "input_schema": {
            "type": "object",
            "properties": {
                "title": {
                    "type": "string",
                    "description": "Title of the note",
                },
                "content": {
                    "type": "string",
                    "description": "Content of the note",
                },
                "tags": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional tags for categorization",
                },
            },
            "required": ["title", "content"],
        },
    },
    # Calculator Tool
    {
        "name": "calculator",
        "description": "Perform mathematical calculations. Supports basic arithmetic and common functions.",
        "input_schema": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Mathematical expression to evaluate (e.g., '2 + 2', 'sqrt(16)', '100 * 0.15')",
                },
            },
            "required": ["expression"],
        },
    },
]
163 
164 
165# =============================================================================
166# REAL-WORLD SIMULATED DATA
167# Based on actual documentation and research from AI companies
168# =============================================================================
169 
# Simulated web search results with REAL URLs.
# Keys are lowercase topic phrases. The web_search tool returns a key's results
# when the query shares at least one whitespace-separated word with that key.
# Each result is a dict with "title", "url", and "snippet".
SEARCH_DATABASE = {
    "context engineering ai": [
        {
            "title": "Context Engineering for AI Agents - Anthropic",
            "url": "https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
            "snippet": "Prompt caching is a feature that optimizes API usage by allowing resuming from specific prefixes in your prompts. Cache the context you want to reuse across requests.",
        },
        {
            "title": "Building Effective AI Agents - Anthropic Research",
            "url": "https://www.anthropic.com/research/building-effective-agents",
            "snippet": "A comprehensive guide to building effective AI agents. Covers tool use, context management, error handling, and best practices for production deployments.",
        },
        {
            "title": "Large Language Models and Context Windows - OpenAI",
            "url": "https://platform.openai.com/docs/guides/text-generation",
            "snippet": "Understanding how context windows work in large language models. Learn about token limits, context management strategies, and optimizing for performance.",
        },
    ],
    "interleaved thinking agents": [
        {
            "title": "MiniMax M2.1 - Interleaved Thinking Model",
            "url": "https://www.minimax.io/platform/docs/M2.1",
            "snippet": "M2.1 introduces interleaved thinking - the ability for models to reason between tool calls, enabling better debugging and adaptability in agentic workflows.",
        },
        {
            "title": "Chain of Thought Prompting - Google Research",
            "url": "https://arxiv.org/abs/2201.11903",
            "snippet": "Chain-of-thought prompting enables complex reasoning in large language models. This paper explores how step-by-step reasoning improves model performance.",
        },
    ],
    "prompt optimization techniques": [
        {
            "title": "Prompt Engineering Guide - DAIR.AI",
            "url": "https://www.promptingguide.ai/techniques",
            "snippet": "Comprehensive guide to prompt engineering techniques including zero-shot, few-shot, chain-of-thought, and advanced methods for optimizing LLM outputs.",
        },
        {
            "title": "Best Practices for Prompt Engineering - OpenAI",
            "url": "https://platform.openai.com/docs/guides/prompt-engineering",
            "snippet": "Official OpenAI guide on prompt engineering best practices. Covers strategies for getting better results, handling edge cases, and iterative refinement.",
        },
    ],
    "agent debugging best practices": [
        {
            "title": "Debugging AI Agents - LangChain Documentation",
            "url": "https://python.langchain.com/docs/how_to/debugging",
            "snippet": "Learn how to debug LangChain agents effectively. Covers tracing, verbose mode, callbacks, and common debugging patterns for complex agent workflows.",
        },
        {
            "title": "LLM Observability and Tracing - Weights & Biases",
            "url": "https://docs.wandb.ai/guides/prompts",
            "snippet": "Track and debug LLM applications with W&B Prompts. Visualize chains, compare outputs, and identify failure patterns in your AI applications.",
        },
    ],
    "context window optimization": [
        {
            "title": "Claude's Context Window - Anthropic Documentation",
            "url": "https://docs.anthropic.com/en/docs/build-with-claude/context-windows",
            "snippet": "Claude supports context windows up to 200K tokens. Learn how to effectively use large context windows and optimize token usage for cost and performance.",
        },
        {
            "title": "Lost in the Middle: How Language Models Use Long Contexts",
            "url": "https://arxiv.org/abs/2307.03172",
            "snippet": "Research on how LLMs utilize information across long contexts. Models perform worse when relevant info is in the middle vs. beginning/end of context.",
        },
    ],
}
238 
# Simulated webpage content based on REAL documentation.
# Maps an exact URL to the markdown text that the read_url tool returns for it.
# URLs with no entry here produce an error payload from read_url; note that a
# few URLs listed in SEARCH_DATABASE (e.g. https://arxiv.org/abs/2201.11903)
# have no page content defined.
PAGE_CONTENT = {
    "https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching": """
# Prompt Caching - Anthropic Documentation

Prompt caching is a feature that optimizes API usage by allowing you to cache frequently used context.

## Overview

Prompt caching allows you to cache the system prompt, examples, and other static content that remains constant across multiple requests. This:

- **Reduces latency** by up to 85% for cached content
- **Lowers costs** by avoiding re-processing of identical context
- **Improves throughput** for high-volume applications

## How It Works

When you enable prompt caching, the API stores a hash of your prompt prefix. On subsequent requests with the same prefix, the cached computation is reused.

### Cache Breakpoints

You can specify cache breakpoints using the `cache_control` parameter:

```python
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Your static context here...",
                "cache_control": {"type": "ephemeral"}
            }
        ]
    }
]
```

## Best Practices

1. **Cache stable content**: Put instructions and examples that don't change in the cached portion
2. **Place dynamic content last**: User queries and variable data should come after cached content
3. **Monitor cache hits**: Use the response headers to track cache efficiency
4. **Minimum cache size**: Content must be at least 1024 tokens to be cached

## Context Engineering Implications

Effective prompt caching is a key part of context engineering. By understanding what to cache:

- System prompts with role definitions
- Tool descriptions that remain constant
- Few-shot examples for consistent behavior
- Reference documentation the model needs

You reduce both latency and cost while maintaining quality.
""",
    "https://www.anthropic.com/research/building-effective-agents": """
# Building Effective AI Agents - Anthropic Research

This guide covers best practices for building reliable, effective AI agents using Claude.

## Core Principles

### 1. Start Simple, Add Complexity Gradually

Begin with the simplest possible agent architecture:
- Single tool with clear purpose
- Linear workflow without branching
- Explicit success criteria

Only add complexity when you have evidence it's needed.

### 2. Tool Design Matters

Well-designed tools make agents more reliable:

- **Clear descriptions**: Explain what the tool does AND when to use it
- **Typed inputs**: Use JSON Schema to define expected parameters
- **Informative outputs**: Return data the model can interpret and act on
- **Error messages**: Provide actionable guidance when things fail

### 3. Context Management

Context is your most precious resource:

- **Token efficiency**: Every token costs money and attention
- **Structured format**: Use consistent formatting for easier parsing
- **Progressive disclosure**: Load information on-demand
- **Summarization**: Compress long histories while preserving key facts

### 4. Error Handling

Agents will encounter errors. Design for recovery:

- Give the model explicit permission to retry
- Provide diagnostic information in error messages
- Set clear stopping conditions to prevent infinite loops
- Log everything for debugging

## Common Anti-Patterns

1. **Over-engineering**: Building complex multi-agent systems before validating single-agent performance
2. **Vague tools**: Tool descriptions that don't clarify when to use each tool
3. **Context overload**: Stuffing too much information into the prompt
4. **No exit conditions**: Letting agents run indefinitely without progress checks

## Debugging Strategies

### Trace Analysis

The key to debugging agents is understanding their reasoning:

1. Capture the full reasoning trace including thinking blocks
2. Identify where the agent's understanding diverged from reality
3. Look for patterns: tool confusion, goal drift, context loss
4. Iterate on prompts based on specific failure modes

### Interleaved Thinking

Models with interleaved thinking (reasoning between tool calls) provide better debugging insight because you can see:

- How they interpreted each tool result
- What alternatives they considered
- When and why they changed approach
""",
    "https://platform.openai.com/docs/guides/text-generation": """
# Text Generation - OpenAI Documentation

Learn how to generate text with OpenAI's models.

## Context Windows

Each model has a context window that determines the maximum number of tokens it can process:

| Model | Context Window |
|-------|----------------|
| GPT-4o | 128K tokens |
| GPT-4 Turbo | 128K tokens |
| GPT-3.5 Turbo | 16K tokens |

### Managing Context

For long conversations or documents:

1. **Truncation**: Remove oldest messages when approaching the limit
2. **Summarization**: Replace old messages with summaries
3. **Retrieval**: Use RAG to fetch only relevant content

### Token Counting

Use the tiktoken library to count tokens before sending requests:

```python
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")
num_tokens = len(encoding.encode("Your text here"))
```

## Best Practices

### Structured Prompts

Organize your prompts with clear sections:
- System message: Role and general instructions
- Context: Background information needed
- Task: Specific request with format requirements
- Examples: Few-shot demonstrations if helpful

### Temperature and Sampling

- **temperature=0**: Deterministic, best for factual tasks
- **temperature=0.7**: Balanced creativity and coherence
- **temperature=1.0+**: More random, for creative tasks
""",
    "https://www.minimax.io/platform/docs/M2.1": """
# MiniMax M2.1 - Interleaved Thinking Model

M2.1 is a next-generation reasoning model that introduces **interleaved thinking** - continuous reasoning throughout task execution.

## What is Interleaved Thinking?

Traditional reasoning models think once at the start, then execute:
```
Think → Act → Act → Act → Done
```

M2.1 thinks between every action:
```
Think → Act → Think → Act → Think → Act → Done
```

## Why This Matters

### 1. Better Debugging

The thinking blocks expose the model's reasoning process. You can see:
- What it understood from tool results
- How it decided what to do next
- Where it might have gone wrong

### 2. Adaptive Behavior

By reasoning after each tool call, M2.1 can:
- React to unexpected outputs
- Recover from errors mid-execution
- Adjust strategy based on new information

### 3. Long-Horizon Tasks

For complex multi-step tasks, maintaining focus is crucial. Interleaved thinking:
- Reinforces the original goal
- Tracks progress toward completion
- Identifies when the task is done

## API Usage

### Anthropic SDK

```python
import anthropic

client = anthropic.Anthropic(
    api_key="your-key",
    base_url="https://api.minimax.io/anthropic"
)

response = client.messages.create(
    model="MiniMax-M2.1",
    max_tokens=4096,
    messages=[{"role": "user", "content": "Your task"}]
)

# Access thinking blocks
for block in response.content:
    if block.type == "thinking":
        print(f"Thinking: {block.thinking}")
    elif block.type == "text":
        print(f"Response: {block.text}")
```

## Best Practices

1. **Preserve full context**: Always include thinking blocks in message history
2. **Clear tool descriptions**: Help the model understand when to use each tool
3. **Explicit success criteria**: Define what "done" looks like
4. **Error guidance**: Give clear instructions for handling failures
""",
    "https://www.promptingguide.ai/techniques": """
# Prompt Engineering Techniques - DAIR.AI

A comprehensive guide to prompt engineering techniques for large language models.

## Basic Techniques

### Zero-Shot Prompting

Ask the model to perform a task without examples:

```
Classify this text as positive, negative, or neutral:
"I really enjoyed the movie but the ending was disappointing."
```

### Few-Shot Prompting

Provide examples to guide the model:

```
Classify sentiment:
"Great product!" → Positive
"Terrible service." → Negative
"It was okay." → Neutral
"I really enjoyed the movie but the ending was disappointing." →
```

## Advanced Techniques

### Chain-of-Thought (CoT)

Encourage step-by-step reasoning:

```
Solve this problem step by step:
If John has 5 apples and gives 2 to Mary, then buys 3 more, how many does he have?

Let's think through this:
1. John starts with 5 apples
2. He gives 2 to Mary: 5 - 2 = 3 apples
3. He buys 3 more: 3 + 3 = 6 apples
Answer: 6 apples
```

### Self-Consistency

Generate multiple reasoning paths and take the majority answer. Improves reliability for complex reasoning tasks.

### Tree of Thoughts

Explore multiple reasoning branches simultaneously, evaluating and pruning paths to find optimal solutions.

## Prompt Optimization

### Iterative Refinement

1. Start with a basic prompt
2. Test on representative examples
3. Analyze failures
4. Refine prompt based on patterns
5. Repeat until convergence

### Common Failure Patterns

| Pattern | Solution |
|---------|----------|
| Goal drift | Add explicit goal reminders |
| Hallucination | Require source citations |
| Incomplete output | Specify format requirements |
| Wrong tool usage | Improve tool descriptions |
""",
    "https://platform.openai.com/docs/guides/prompt-engineering": """
# Prompt Engineering Best Practices - OpenAI

Official guide to getting better results from large language models.

## Six Strategies

### 1. Write Clear Instructions

Be specific about what you want:
- Include details in your query
- Ask the model to adopt a persona
- Use delimiters to mark distinct sections
- Specify desired output format and length

### 2. Provide Reference Text

Reduce hallucinations:
- Instruct the model to answer using provided text
- Ask for citations from the source material
- Use retrieval to inject relevant context

### 3. Split Complex Tasks

Break down hard problems:
- Use intent classification to route queries
- Summarize long documents in chunks
- Break multi-step tasks into sequential prompts

### 4. Give the Model Time to Think

Improve reasoning:
- Ask for a chain of reasoning
- Use inner monologue to hide intermediate steps
- Ask if previous steps were correct

### 5. Use External Tools

Augment model capabilities:
- Use code execution for accurate calculations
- Use retrieval for up-to-date information
- Use APIs for specific functionality

### 6. Test Changes Systematically

Evaluate prompt effectiveness:
- Define comprehensive test cases
- Measure against gold-standard answers
- Track metrics over prompt iterations

## Anti-Patterns to Avoid

1. **Ambiguous instructions**: "Make it better" vs "Improve clarity by adding examples"
2. **Too much context**: Relevant info gets lost in noise
3. **No output format**: Model guesses what you want
4. **Assuming knowledge**: Model doesn't know your codebase/domain
""",
    "https://python.langchain.com/docs/how_to/debugging": """
# Debugging LangChain Agents

Learn effective debugging strategies for LangChain applications.

## Verbose Mode

Enable detailed logging:

```python
from langchain.globals import set_verbose

set_verbose(True)
```

This prints:
- Each step in the chain
- Inputs and outputs at every stage
- Tool calls and their results

## LangSmith Tracing

For production debugging, use LangSmith:

```python
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "your-key"
```

LangSmith provides:
- Visual trace of every step
- Latency breakdown
- Token usage tracking
- Failure analysis

## Common Debugging Patterns

### 1. Tool Selection Issues

The agent picks the wrong tool. Debug by:
- Checking tool descriptions for clarity
- Reviewing the prompt format
- Testing with simplified tool sets

### 2. Infinite Loops

Agent repeats the same action. Fix by:
- Adding max_iterations limit
- Including progress checks in prompts
- Implementing early stopping conditions

### 3. Context Loss

Agent forgets earlier information. Address by:
- Checking context window limits
- Implementing conversation summarization
- Using retrieval for long-term memory

### 4. Hallucination

Agent makes up information. Reduce by:
- Requiring citations
- Validating outputs against sources
- Using temperature=0 for factual tasks

## Trace Analysis

The most powerful debugging technique is analyzing the full trace:

1. Capture all inputs, outputs, and reasoning
2. Find the exact step where things went wrong
3. Identify the pattern (tool confusion, goal drift, etc.)
4. Update prompts to address the specific failure
""",
    # NOTE: the position labels below were corrected. An advantage for
    # information at the *beginning* of the context is the primacy effect and
    # an advantage at the *end* is the recency effect; the previous text had
    # the two terms swapped.
    "https://arxiv.org/abs/2307.03172": """
# Lost in the Middle: How Language Models Use Long Contexts

Liu et al., 2023

## Abstract

While large language models support increasingly long context windows, we find they struggle to effectively use information in the middle of long contexts. This "lost in the middle" phenomenon has important implications for RAG systems and context engineering.

## Key Findings

### 1. U-Shaped Performance Curve

When relevant information is placed at different positions in a long context:
- **Beginning**: High performance (primacy effect)
- **Middle**: Significantly degraded performance
- **End**: High performance (recency effect)

### 2. Performance Degrades with Context Length

Even when information is at optimal positions, performance decreases as total context length increases.

### 3. Model Size Doesn't Fix It

Larger models show the same pattern. This is a fundamental limitation of current architectures.

## Implications for Practitioners

### Context Engineering Strategies

1. **Place critical information at the start or end**
   - Instructions at the beginning
   - Task-specific context at the end

2. **Keep context focused**
   - Only include truly relevant information
   - Remove redundant or low-signal content

3. **Structure for attention**
   - Use clear section headers
   - Separate distinct topics
   - Front-load important details in each section

### RAG System Design

1. **Limit retrieved chunks**
   - Quality over quantity
   - Rank by relevance, not just similarity

2. **Position retrieved content strategically**
   - Most relevant chunks at boundaries
   - Less relevant in middle if needed

3. **Consider summarization**
   - Condense multiple sources
   - Preserve key information density
""",
}
749 
# Simulated file system with realistic project structure.
# Maps an exact path string (as the model is expected to pass it, e.g.
# "./project/README.md") to the file's text. The read_file tool consults this
# mapping first, then falls back to files created at runtime via write_file.
FILE_SYSTEM = {
    "./project/README.md": """# AI Agent Research Project

This project explores context engineering and agent optimization techniques.

## Structure
- research/ - Research notes and findings
- output/ - Generated reports and summaries
- data/ - Source materials and datasets

## Current Focus
1. Understanding context engineering principles
2. Exploring interleaved thinking for debugging
3. Developing prompt optimization strategies

## Resources
- Anthropic Documentation: https://docs.anthropic.com
- OpenAI Guides: https://platform.openai.com/docs
- MiniMax M2.1: https://www.minimax.io
""",
    "./project/research/notes.md": """# Research Notes

## Context Engineering

### Definition
Context engineering is the discipline of managing what information enters the AI model's context window. It goes beyond prompt engineering to consider:
- System prompts and instructions
- Tool definitions and descriptions
- Retrieved documents (RAG)
- Conversation history
- Tool outputs and intermediate results

### Key Insight: "Lost in the Middle"
Research shows LLMs struggle with information in the middle of long contexts. Place important information at the start or end.

### Best Practices
1. Quality over quantity - only include high-signal tokens
2. Structure matters - use clear formatting and hierarchies
3. Progressive disclosure - load information on-demand
4. Attention anchoring - place critical info at boundaries

## Interleaved Thinking

### What It Is
The ability for models to reason between tool calls, not just at the start.

### Benefits
- Full visibility into agent reasoning
- Better debugging and error recovery
- Adaptive behavior based on tool results

### MiniMax M2.1
- Implements interleaved thinking
- Exposes reasoning via `thinking` blocks
- Compatible with Anthropic SDK

## Open Questions
- How to measure context efficiency?
- Optimal strategies for tool descriptions?
- Balancing context size vs. quality?
""",
    "./project/research/references.md": """# References

## Papers
1. "Lost in the Middle: How Language Models Use Long Contexts" - Liu et al., 2023
2. "Chain-of-Thought Prompting Elicits Reasoning" - Wei et al., 2022

## Documentation
- Anthropic: https://docs.anthropic.com/en/docs
- OpenAI: https://platform.openai.com/docs
- MiniMax: https://www.minimax.io/platform/docs

## Guides
- Prompt Engineering Guide: https://www.promptingguide.ai
- LangChain Debugging: https://python.langchain.com/docs/how_to/debugging
""",
}
828 
# Runtime state (module-level, mutated in place by the tool executor).
# saved_notes: starts empty; presumably appended to by the save_note tool --
#   the handler is not visible in this part of the file, confirm its shape there.
# written_files: maps path -> content for files created through the write_file
#   tool, so that a later read_file on the same path can return them.
saved_notes = []
written_files = {}
832 
833 
834# =============================================================================
835# TOOL EXECUTOR
836# =============================================================================
837 
838def execute_tool(name: str, input_data: dict) -> str:
839    """Execute a tool and return realistic results."""
840    global saved_notes, written_files
841 
842    if name == "web_search":
843        query = input_data.get("query", "").lower()
844        num_results = min(input_data.get("num_results", 5), 10)
845 
846        # Find matching results
847        results = []
848        for key, items in SEARCH_DATABASE.items():
849            # Check if any query words match the key
850            query_words = set(query.split())
851            key_words = set(key.split())
852            if query_words & key_words:  # Intersection
853                results.extend(items)
854 
855        # Deduplicate and limit
856        seen_urls = set()
857        unique_results = []
858        for r in results:
859            if r["url"] not in seen_urls:
860                seen_urls.add(r["url"])
861                unique_results.append(r)
862 
863        if not unique_results:
864            # Return generic "no results" response
865            return json.dumps({
866                "query": query,
867                "num_results": 0,
868                "results": [],
869                "message": "No results found. Try different keywords.",
870            })
871 
872        return json.dumps({
873            "query": query,
874            "num_results": len(unique_results[:num_results]),
875            "results": unique_results[:num_results],
876        })
877 
878    elif name == "read_url":
879        url = input_data.get("url", "")
880        content = PAGE_CONTENT.get(url)
881 
882        if content:
883            return json.dumps({
884                "url": url,
885                "status": "success",
886                "content": content,
887                "length": len(content),
888            })
889        else:
890            return json.dumps({
891                "url": url,
892                "status": "error",
893                "error": "Page not found or unable to fetch content",
894            })
895 
896    elif name == "read_file":
897        path = input_data.get("path", "")
898 
899        # Check mock file system first
900        if path in FILE_SYSTEM:
901            return json.dumps({
902                "path": path,
903                "status": "success",
904                "content": FILE_SYSTEM[path],
905            })
906 
907        # Check written files
908        if path in written_files:
909            return json.dumps({
910                "path": path,
911                "status": "success",
912                "content": written_files[path],
913            })
914 
915        return json.dumps({
916            "path": path,
917            "status": "error",
918            "error": f"File not found: {path}",
919        })
920 
921    elif name == "write_file":
922        path = input_data.get("path", "")
923        content = input_data.get("content", "")
924 
925        written_files[path] = content
926        return json.dumps({
927            "path": path,
928            "status": "success",
929            "message": f"Successfully wrote {len(content)} characters to {path}",
930        })
931 
932    elif name == "list_directory":
933        path = input_data.get("path", ".")
934 
935        # Simulate directory listing based on mock file system
936        if path == "." or path == "./project":
937            return json.dumps({
938                "path": path,
939                "entries": [
940                    {"name": "README.md", "type": "file"},
941                    {"name": "research", "type": "directory"},
942                    {"name": "output", "type": "directory"},
943                    {"name": "data", "type": "directory"},
944                ],
945            })
946        elif path == "./project/research" or path == "research":
947            return json.dumps({
948                "path": path,
949                "entries": [
950                    {"name": "notes.md", "type": "file"},
951                    {"name": "references.md", "type": "file"},
952                ],
953            })
954        else:
955            return json.dumps({
956                "path": path,
957                "entries": [],
958                "message": "Directory is empty or does not exist",
959            })
960 
961    elif name == "save_note":
962        note = {
963            "id": len(saved_notes) + 1,
964            "title": input_data.get("title", "Untitled"),
965            "content": input_data.get("content", ""),
966            "tags": input_data.get("tags", []),
967            "timestamp": datetime.now().isoformat(),
968        }
969        saved_notes.append(note)
970        return json.dumps({
971            "status": "success",
972            "note_id": note["id"],
973            "message": f"Note '{note['title']}' saved successfully",
974        })
975 
976    elif name == "calculator":
977        expression = input_data.get("expression", "")
978        try:
979            # Safe evaluation of mathematical expressions
980            import math
981            allowed_names = {
982                "sqrt": math.sqrt,
983                "sin": math.sin,
984                "cos": math.cos,
985                "tan": math.tan,
986                "log": math.log,
987                "log10": math.log10,
988                "exp": math.exp,
989                "pow": pow,
990                "abs": abs,
991                "round": round,
992                "pi": math.pi,
993                "e": math.e,
994            }
995            result = eval(expression, {"__builtins__": {}}, allowed_names)
996            return json.dumps({
997                "expression": expression,
998                "result": result,
999                "status": "success",
1000            })
1001        except Exception as e:
1002            return json.dumps({
1003                "expression": expression,
1004                "status": "error",
1005                "error": str(e),
1006            })
1007 
1008    return json.dumps({"error": f"Unknown tool: {name}"})
1009 
1010 
1011# =============================================================================
1012# MAIN OPTIMIZATION LOOP
1013# =============================================================================
1014 
def _preview(text, limit=80):
    """Return ``text`` cut to ``limit`` characters, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _print_banner(title, *, leading_blank=True):
    """Print ``title`` framed above and below by a 70-character "=" rule."""
    rule = "=" * 70
    print(f"\n{rule}" if leading_blank else rule)
    print(title)
    print(rule)


def main():
    """Run the full optimization loop with comprehensive tools.

    Resets the module-level ``saved_notes`` / ``written_files`` state, builds a
    ``LoopConfig`` and an ``OptimizationLoop``, runs the loop on a multi-tool
    research task starting from a deliberately weak prompt, and prints a
    report: per-iteration details, the final prompt, tool usage counts, the
    notes and files produced by the tools, and a closing summary. It also
    asks ``SkillGenerator`` to generate a skill from the result.

    Returns:
        None. All results are reported on stdout.
    """
    # Local import, matching the function-scope import style already used in
    # this file's tool executor.
    from collections import Counter

    global saved_notes, written_files

    # Reset state
    saved_notes = []
    written_files = {}

    # Connection settings shared by the optimization loop and the skill
    # generator; bound once so the two clients cannot drift apart.
    api_key = os.getenv("ANTHROPIC_API_KEY")
    base_url = "https://api.minimax.io/anthropic"
    model = "MiniMax-M2.1"

    # Configuration for optimization
    # Note: Complex research tasks typically plateau around 65-75 scores
    # due to inherent variability in multi-tool reasoning chains
    config = LoopConfig(
        max_iterations=5,  # Usually converges within 3-5 iterations
        convergence_threshold=3.0,  # Stop when improvements become marginal
        min_score_threshold=75.0,  # Realistic target for complex research tasks
        regression_threshold=8.0,  # Detect significant score drops
        use_best_prompt=True,  # Always use the best-performing prompt
        max_prompt_growth=5.0,  # Prevent excessive prompt bloat
        save_artifacts=True,
        artifacts_dir="./optimization_artifacts",
        verbose=True,
    )

    # Initialize the optimization loop
    loop = OptimizationLoop(
        config=config,
        api_key=api_key,
        base_url=base_url,
        model=model,
    )

    # Complex research task requiring multiple tools
    task = """Research the topic of "context engineering for AI agents" and create a comprehensive summary.

Your research should:
1. Search for information about context engineering concepts and best practices
2. Read relevant sources to gather detailed information
3. Check the local project files for any existing research notes
4. Save important findings as notes for future reference
5. Write a final summary report to ./output/research_summary.md

The summary should include:
- Key concepts and definitions
- Best practices and techniques (including the "lost in the middle" problem)
- Practical recommendations for agent developers
- References to sources consulted (use actual URLs from your research)"""

    # Intentionally weak initial prompt to show optimization improvement
    initial_prompt = """You are a research assistant. Help with research tasks using the available tools."""

    _print_banner("COMPREHENSIVE OPTIMIZATION LOOP DEMONSTRATION", leading_blank=False)
    print(f"\nTask:\n{task}")
    print(f"\nInitial (weak) prompt:\n{initial_prompt}")
    print(f"\nTools available: {', '.join(t['name'] for t in TOOLS)}")
    _print_banner("Starting optimization loop...")

    # Run the optimization loop
    result = loop.run(
        task=task,
        initial_prompt=initial_prompt,
        tools=TOOLS,
        tool_executor=execute_tool,
    )

    # Show results
    _print_banner("OPTIMIZATION RESULTS")

    print(f"\nTotal Iterations: {result.total_iterations}")
    print(f"Converged: {result.converged}")
    print(f"Score Improvement: {result.initial_score:.1f} → {result.final_score:.1f} ({result.improvement_percentage:+.1f}%)")

    _print_banner("ITERATION DETAILS")

    for iteration in result.iterations:
        analysis = iteration.analysis

        print(f"\n{'─' * 50}")
        print(f"ITERATION {iteration.iteration}")
        print(f"{'─' * 50}")
        print(f"Task Completed: {iteration.task_completed}")
        print(f"Score: {analysis.overall_score:.1f}/100")
        print(f"Patterns Found: {len(analysis.patterns)}")
        print(f"Tool Calls Made: {len(iteration.trace.tool_calls)}")
        print(f"Thinking Blocks: {len(iteration.trace.thinking_blocks)}")

        if analysis.patterns:
            print("\nDetected Patterns:")
            for p in analysis.patterns:
                print(f"  [{p.severity.value.upper()}] {p.type.value}")
                print(f"       {_preview(p.description)}")
                print(f"       Suggestion: {_preview(p.suggestion)}")

        # Only the first three entries of each list are shown to keep the
        # per-iteration report short.
        if analysis.strengths:
            print("\nStrengths:")
            for s in analysis.strengths[:3]:
                print(f"  + {_preview(s)}")

        if analysis.weaknesses:
            print("\nWeaknesses:")
            for w in analysis.weaknesses[:3]:
                print(f"  - {_preview(w)}")

        if iteration.optimization and iteration.optimization.key_changes:
            print("\nKey Changes Applied:")
            for change in iteration.optimization.key_changes[:3]:
                print(f"  • {_preview(change)}")

    _print_banner("FINAL OPTIMIZED PROMPT")
    print(result.final_prompt)

    # Show tool usage summary
    _print_banner("TOOL USAGE ACROSS ALL ITERATIONS")

    tool_usage = Counter(
        tc.name
        for iteration in result.iterations
        for tc in iteration.trace.tool_calls
    )

    # most_common() orders by descending count; ties keep first-seen order.
    for tool_name, count in tool_usage.most_common():
        print(f"  {tool_name}: {count} calls")

    # Show saved notes
    if saved_notes:
        _print_banner("NOTES SAVED DURING RESEARCH")
        for note in saved_notes:
            print(f"\n[{note['id']}] {note['title']}")
            if note['tags']:
                print(f"   Tags: {', '.join(note['tags'])}")
            print(f"   {_preview(note['content'], 150)}")

    # Show written files
    if written_files:
        _print_banner("FILES WRITTEN DURING RESEARCH")
        for path, content in written_files.items():
            print(f"\n{path} ({len(content)} chars)")
            print(f"   Preview: {_preview(content, 200)}")

    # Generate a shareable skill
    _print_banner("GENERATING SHAREABLE SKILL")

    generator = SkillGenerator(
        api_key=api_key,
        base_url=base_url,
        model=model,
    )

    skill_path = generator.generate(
        result=result,
        skill_name="comprehensive-research-agent",
        output_dir="./generated_skills",
        title="Comprehensive Research Agent Best Practices",
    )

    print(f"\nGenerated skill at: {skill_path}")
    print("\nThis skill captures the learnings from optimization and can be shared")
    print("with other developers to improve their research agents!")

    # Final summary
    total_thinking_blocks = sum(
        len(i.trace.thinking_blocks) for i in result.iterations
    )
    # Sorted so the summary line is stable from run to run (a bare set has no
    # guaranteed iteration order for strings).
    pattern_types = sorted(
        {p.type.value for i in result.iterations for p in i.analysis.patterns}
    )

    _print_banner("SUMMARY")
    print(f"""
The optimization loop demonstrated:

1. INTERLEAVED THINKING
   - {total_thinking_blocks} thinking blocks captured across {result.total_iterations} iterations
   - Full visibility into agent reasoning between tool calls

2. PATTERN DETECTION
   - Identified patterns: {', '.join(pattern_types) or 'None'}
   - Each pattern includes evidence and suggestions

3. PROMPT OPTIMIZATION
   - Initial score: {result.initial_score:.1f}
   - Final score: {result.final_score:.1f}
   - Improvement: {result.improvement_percentage:+.1f}%

4. SKILL GENERATION
   - Created shareable skill at: {skill_path}
   - Captures learnings for other developers

5. REAL-WORLD URLS USED
   - Anthropic: docs.anthropic.com
   - OpenAI: platform.openai.com
   - MiniMax: minimax.io
   - DAIR.AI: promptingguide.ai
   - Research papers: arxiv.org
""")
1219 
1220 
# Run the demonstration only when this file is executed as a script, so that
# importing the module does not start the optimization loop.
if __name__ == "__main__":
    main()
1223
Preparing the source view

Agent Skills for Context Engineering

examples/interleaved-thinking/examples/03_full_optimization.py