This skill guides creation of high-quality MCP servers in TypeScript or Python (FastMCP) to connect LLMs with external services.
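For context, the servers this skill targets can be very small. Below is a minimal sketch of a Python (FastMCP) server; it assumes the standalone `fastmcp` package, and the tool name and data are invented for illustration. Saved as `my_server.py`, it is the kind of server the stdio example in the evaluation harness below could be pointed at.

```python
# Hypothetical minimal FastMCP server: a sketch, assuming the standalone
# `fastmcp` package; the tool and its return value are made up for illustration.
from fastmcp import FastMCP

mcp = FastMCP("demo-weather")

@mcp.tool()
def get_forecast(city: str) -> str:
    """Return a short weather forecast for the given city."""
    return f"Forecast for {city}: sunny, 22 C"  # stub data

if __name__ == "__main__":
    mcp.run()  # defaults to the stdio transport
```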
scripts/evaluation.py (the evaluation harness included in the skill package):
1"""MCP Server Evaluation Harness23This script evaluates MCP servers by running test questions against them using Claude.4"""56import argparse7import asyncio8import json9import re10import sys11import time12import traceback13import xml.etree.ElementTree as ET14from pathlib import Path15from typing import Any1617from anthropic import Anthropic1819from connections import create_connection2021EVALUATION_PROMPT = """You are an AI assistant with access to tools.2223When given a task, you MUST:241. Use the available tools to complete the task252. Provide summary of each step in your approach, wrapped in <summary> tags263. Provide feedback on the tools provided, wrapped in <feedback> tags274. Provide your final response, wrapped in <response> tags2829Summary Requirements:30- In your <summary> tags, you must explain:31- The steps you took to complete the task32- Which tools you used, in what order, and why33- The inputs you provided to each tool34- The outputs you received from each tool35- A summary for how you arrived at the response3637Feedback Requirements:38- In your <feedback> tags, provide constructive feedback on the tools:39- Comment on tool names: Are they clear and descriptive?40- Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?41- Comment on descriptions: Do they accurately describe what the tool does?42- Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?43- Identify specific areas for improvement and explain WHY they would help44- Be specific and actionable in your suggestions4546Response Requirements:47- Your response should be concise and directly address what was asked48- Always wrap your final response in <response> tags49- If you cannot solve the task return <response>NOT_FOUND</response>50- For numeric responses, provide just the number51- For IDs, provide just the ID52- For names or text, provide the exact text requested53- Your response should go last"""545556def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:57"""Parse XML evaluation file with qa_pair elements."""58try:59tree = ET.parse(file_path)60root = tree.getroot()61evaluations = []6263for qa_pair in root.findall(".//qa_pair"):64question_elem = qa_pair.find("question")65answer_elem = qa_pair.find("answer")6667if question_elem is not None and answer_elem is not None:68evaluations.append({69"question": (question_elem.text or "").strip(),70"answer": (answer_elem.text or "").strip(),71})7273return evaluations74except Exception as e:75print(f"Error parsing evaluation file {file_path}: {e}")76return []777879def extract_xml_content(text: str, tag: str) -> str | None:80"""Extract content from XML tags."""81pattern = rf"<{tag}>(.*?)</{tag}>"82matches = re.findall(pattern, text, re.DOTALL)83return matches[-1].strip() if matches else None848586async def agent_loop(87client: Anthropic,88model: str,89question: str,90tools: list[dict[str, Any]],91connection: Any,92) -> tuple[str, dict[str, Any]]:93"""Run the agent loop with MCP tools."""94messages = [{"role": "user", "content": question}]9596response = await asyncio.to_thread(97client.messages.create,98model=model,99max_tokens=4096,100system=EVALUATION_PROMPT,101messages=messages,102tools=tools,103)104105messages.append({"role": "assistant", "content": response.content})106107tool_metrics = {}108109while response.stop_reason == "tool_use":110tool_use = next(block for block in response.content if block.type == "tool_use")111tool_name = 
tool_use.name112tool_input = tool_use.input113114tool_start_ts = time.time()115try:116tool_result = await connection.call_tool(tool_name, tool_input)117tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)118except Exception as e:119tool_response = f"Error executing tool {tool_name}: {str(e)}\n"120tool_response += traceback.format_exc()121tool_duration = time.time() - tool_start_ts122123if tool_name not in tool_metrics:124tool_metrics[tool_name] = {"count": 0, "durations": []}125tool_metrics[tool_name]["count"] += 1126tool_metrics[tool_name]["durations"].append(tool_duration)127128messages.append({129"role": "user",130"content": [{131"type": "tool_result",132"tool_use_id": tool_use.id,133"content": tool_response,134}]135})136137response = await asyncio.to_thread(138client.messages.create,139model=model,140max_tokens=4096,141system=EVALUATION_PROMPT,142messages=messages,143tools=tools,144)145messages.append({"role": "assistant", "content": response.content})146147response_text = next(148(block.text for block in response.content if hasattr(block, "text")),149None,150)151return response_text, tool_metrics152153154async def evaluate_single_task(155client: Anthropic,156model: str,157qa_pair: dict[str, Any],158tools: list[dict[str, Any]],159connection: Any,160task_index: int,161) -> dict[str, Any]:162"""Evaluate a single QA pair with the given tools."""163start_time = time.time()164165print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")166response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)167168response_value = extract_xml_content(response, "response")169summary = extract_xml_content(response, "summary")170feedback = extract_xml_content(response, "feedback")171172duration_seconds = time.time() - start_time173174return {175"question": qa_pair["question"],176"expected": qa_pair["answer"],177"actual": response_value,178"score": int(response_value == qa_pair["answer"]) if response_value else 0,179"total_duration": duration_seconds,180"tool_calls": tool_metrics,181"num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),182"summary": summary,183"feedback": feedback,184}185186187REPORT_HEADER = """188# Evaluation Report189190## Summary191192- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)193- **Average Task Duration**: {average_duration_s:.2f}s194- **Average Tool Calls per Task**: {average_tool_calls:.2f}195- **Total Tool Calls**: {total_tool_calls}196197---198"""199200TASK_TEMPLATE = """201### Task {task_num}202203**Question**: {question}204**Ground Truth Answer**: `{expected_answer}`205**Actual Answer**: `{actual_answer}`206**Correct**: {correct_indicator}207**Duration**: {total_duration:.2f}s208**Tool Calls**: {tool_calls}209210**Summary**211{summary}212213**Feedback**214{feedback}215216---217"""218219220async def run_evaluation(221eval_path: Path,222connection: Any,223model: str = "claude-3-7-sonnet-20250219",224) -> str:225"""Run evaluation with MCP server tools."""226print("๐ Starting Evaluation")227228client = Anthropic()229230tools = await connection.list_tools()231print(f"๐ Loaded {len(tools)} tools from MCP server")232233qa_pairs = parse_evaluation_file(eval_path)234print(f"๐ Loaded {len(qa_pairs)} evaluation tasks")235236results = []237for i, qa_pair in enumerate(qa_pairs):238print(f"Processing task {i + 1}/{len(qa_pairs)}")239result = await evaluate_single_task(client, model, qa_pair, tools, connection, 
i)240results.append(result)241242correct = sum(r["score"] for r in results)243accuracy = (correct / len(results)) * 100 if results else 0244average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0245average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0246total_tool_calls = sum(r["num_tool_calls"] for r in results)247248report = REPORT_HEADER.format(249correct=correct,250total=len(results),251accuracy=accuracy,252average_duration_s=average_duration_s,253average_tool_calls=average_tool_calls,254total_tool_calls=total_tool_calls,255)256257report += "".join([258TASK_TEMPLATE.format(259task_num=i + 1,260question=qa_pair["question"],261expected_answer=qa_pair["answer"],262actual_answer=result["actual"] or "N/A",263correct_indicator="โ " if result["score"] else "โ",264total_duration=result["total_duration"],265tool_calls=json.dumps(result["tool_calls"], indent=2),266summary=result["summary"] or "N/A",267feedback=result["feedback"] or "N/A",268)269for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))270])271272return report273274275def parse_headers(header_list: list[str]) -> dict[str, str]:276"""Parse header strings in format 'Key: Value' into a dictionary."""277headers = {}278if not header_list:279return headers280281for header in header_list:282if ":" in header:283key, value = header.split(":", 1)284headers[key.strip()] = value.strip()285else:286print(f"Warning: Ignoring malformed header: {header}")287return headers288289290def parse_env_vars(env_list: list[str]) -> dict[str, str]:291"""Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""292env = {}293if not env_list:294return env295296for env_var in env_list:297if "=" in env_var:298key, value = env_var.split("=", 1)299env[key.strip()] = value.strip()300else:301print(f"Warning: Ignoring malformed environment variable: {env_var}")302return env303304305async def main():306parser = argparse.ArgumentParser(307description="Evaluate MCP servers using test questions",308formatter_class=argparse.RawDescriptionHelpFormatter,309epilog="""310Examples:311# Evaluate a local stdio MCP server312python evaluation.py -t stdio -c python -a my_server.py eval.xml313314# Evaluate an SSE MCP server315python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml316317# Evaluate an HTTP MCP server with custom model318python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml319""",320)321322parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")323parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")324parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")325326stdio_group = parser.add_argument_group("stdio options")327stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")328stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")329stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")330331remote_group = parser.add_argument_group("sse/http options")332remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")333remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http 
only)")334335parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")336337args = parser.parse_args()338339if not args.eval_file.exists():340print(f"Error: Evaluation file not found: {args.eval_file}")341sys.exit(1)342343headers = parse_headers(args.headers) if args.headers else None344env_vars = parse_env_vars(args.env) if args.env else None345346try:347connection = create_connection(348transport=args.transport,349command=args.command,350args=args.args,351env=env_vars,352url=args.url,353headers=headers,354)355except ValueError as e:356print(f"Error: {e}")357sys.exit(1)358359print(f"๐ Connecting to MCP server via {args.transport}...")360361async with connection:362print("โ Connected successfully")363report = await run_evaluation(args.eval_file, connection, args.model)364365if args.output:366args.output.write_text(report)367print(f"\nโ Report saved to {args.output}")368else:369print("\n" + report)370371372if __name__ == "__main__":373asyncio.run(main())374