This skill guides creation of high-quality MCP servers in TypeScript or Python (FastMCP) to connect LLMs with external services.
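For context, the servers this skill targets can be very small. Below is a minimal sketch of a Python (FastMCP) server; it assumes the standalone `fastmcp` package, and the tool name and data are invented for illustration. Saved as `my_server.py`, it is the kind of server the stdio example in the evaluation harness below could be pointed at.

```python
# Hypothetical minimal FastMCP server: a sketch, assuming the standalone
# `fastmcp` package; the tool and its return value are made up for illustration.
from fastmcp import FastMCP

mcp = FastMCP("demo-weather")

@mcp.tool()
def get_forecast(city: str) -> str:
    """Return a short weather forecast for the given city."""
    return f"Forecast for {city}: sunny, 22 C"  # stub data

if __name__ == "__main__":
    mcp.run()  # defaults to the stdio transport
```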
scripts/evaluation.py (the evaluation harness included in the skill package):
1"""MCP Server Evaluation Harness23This script evaluates MCP servers by running test questions against them using Claude.4"""56import argparse7import asyncio8import json9import re10import sys11import time12import traceback13import xml.etree.ElementTree as ET14from pathlib import Path15from typing import Any1617from anthropic import Anthropic1819from connections import create_connection2021EVALUATION_PROMPT = """You are an AI assistant with access to tools.2223When given a task, you MUST:241. Use the available tools to complete the task252. Provide summary of each step in your approach, wrapped in <summary> tags263. Provide feedback on the tools provided, wrapped in <feedback> tags274. Provide your final response, wrapped in <response> tags2829Summary Requirements:30- In your <summary> tags, you must explain:31- The steps you took to complete the task32- Which tools you used, in what order, and why33- The inputs you provided to each tool34- The outputs you received from each tool35- A summary for how you arrived at the response3637Feedback Requirements:38- In your <feedback> tags, provide constructive feedback on the tools:39- Comment on tool names: Are they clear and descriptive?40- Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?41- Comment on descriptions: Do they accurately describe what the tool does?42- Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?43- Identify specific areas for improvement and explain WHY they would help44- Be specific and actionable in your suggestions4546Response Requirements:47- Your response should be concise and directly address what was asked48- Always wrap your final response in <response> tags49- If you cannot solve the task return <response>NOT_FOUND</response>50- For numeric responses, provide just the number51- For IDs, provide just the ID52- For names or text, provide the exact text requested53- Your response should go last"""545556def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:57"""Parse XML evaluation file with qa_pair elements."""58try:59tree = ET.parse(file_path)60root = tree.getroot()61evaluations = []6263for qa_pair in root.findall(".//qa_pair"):64question_elem = qa_pair.find("question")65answer_elem = qa_pair.find("answer")6667if question_elem is not None and answer_elem is not None:68evaluations.append({69"question": (question_elem.text or "").strip(),70"answer": (answer_elem.text or "").strip(),71})7273return evaluations74except Exception as e:75print(f"Error parsing evaluation file {file_path}: {e}")76return []777879def extract_xml_content(text: str, tag: str) -> str | None:80"""Extract content from XML tags."""81pattern = rf"<{tag}>(.*?)</{tag}>"82matches = re.findall(pattern, text, re.DOTALL)83return matches[-1].strip() if matches else None848586async def agent_loop(87client: Anthropic,88model: str,89question: str,90tools: list[dict[str, Any]],91connection: Any,92) -> tuple[str, dict[str, Any]]:93"""Run the agent loop with MCP tools."""94messages = [{"role": "user", "content": question}]9596response = await asyncio.to_thread(97client.messages.create,98model=model,99max_tokens=4096,100system=EVALUATION_PROMPT,101messages=messages,102tools=tools,103)104105messages.append({"role": "assistant", "content": response.content})106107tool_metrics = {}108109while response.stop_reason == "tool_use":110tool_use = next(block for block in response.content if block.type == "tool_use")111tool_name = 
tool_use.name112tool_input = tool_use.input113114tool_start_ts = time.time()115try:116tool_result = await connection.call_tool(tool_name, tool_input)117tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)118except Exception as e:119tool_response = f"Error executing tool {tool_name}: {str(e)}\n"120tool_response += traceback.format_exc()121tool_duration = time.time() - tool_start_ts122123if tool_name not in tool_metrics:124tool_metrics[tool_name] = {"count": 0, "durations": []}125tool_metrics[tool_name]["count"] += 1126tool_metrics[tool_name]["durations"].append(tool_duration)127128messages.append({129"role": "user",130"content": [{131"type": "tool_result",132"tool_use_id": tool_use.id,133"content": tool_response,134}]135})136137response = await asyncio.to_thread(138client.messages.create,139model=model,140max_tokens=4096,141system=EVALUATION_PROMPT,142messages=messages,143tools=tools,144)145messages.append({"role": "assistant", "content": response.content})146147response_text = next(148(block.text for block in response.content if hasattr(block, "text")),149None,150)151return response_text, tool_metrics152153154async def evaluate_single_task(155client: Anthropic,156model: str,157qa_pair: dict[str, Any],158tools: list[dict[str, Any]],159connection: Any,160task_index: int,161) -> dict[str, Any]:162"""Evaluate a single QA pair with the given tools."""163start_time = time.time()164165print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")166response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)167168response_value = extract_xml_content(response, "response")169summary = extract_xml_content(response, "summary")170feedback = extract_xml_content(response, "feedback")171172duration_seconds = time.time() - start_time173174return {175"question": qa_pair["question"],176"expected": qa_pair["answer"],177"actual": response_value,178"score": int(response_value == qa_pair["answer"]) if response_value else 0,179"total_duration": duration_seconds,180"tool_calls": tool_metrics,181"num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),182"summary": summary,183"feedback": feedback,184}185186187REPORT_HEADER = """188# Evaluation Report189190## Summary191192- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)193- **Average Task Duration**: {average_duration_s:.2f}s194- **Average Tool Calls per Task**: {average_tool_calls:.2f}195- **Total Tool Calls**: {total_tool_calls}196197---198"""199200TASK_TEMPLATE = """201### Task {task_num}202203**Question**: {question}204**Ground Truth Answer**: `{expected_answer}`205**Actual Answer**: `{actual_answer}`206**Correct**: {correct_indicator}207**Duration**: {total_duration:.2f}s208**Tool Calls**: {tool_calls}209210**Summary**211{summary}212213**Feedback**214{feedback}215216---217"""218219220async def run_evaluation(221eval_path: Path,222connection: Any,223model: str = "claude-3-7-sonnet-20250219",224) -> str:225"""Run evaluation with MCP server tools."""226print("๐ Starting Evaluation")227228client = Anthropic()229230tools = await connection.list_tools()231print(f"๐ Loaded {len(tools)} tools from MCP server")232233qa_pairs = parse_evaluation_file(eval_path)234print(f"๐ Loaded {len(qa_pairs)} evaluation tasks")235236results = []237for i, qa_pair in enumerate(qa_pairs):238print(f"Processing task {i + 1}/{len(qa_pairs)}")239result = await evaluate_single_task(client, model, qa_pair, tools, connection, 
i)240results.append(result)241242correct = sum(r["score"] for r in results)243accuracy = (correct / len(results)) * 100 if results else 0244average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0245average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0246total_tool_calls = sum(r["num_tool_calls"] for r in results)247248report = REPORT_HEADER.format(249correct=correct,250total=len(results),251accuracy=accuracy,252average_duration_s=average_duration_s,253average_tool_calls=average_tool_calls,254total_tool_calls=total_tool_calls,255)256257report += "".join([258TASK_TEMPLATE.format(259task_num=i + 1,260question=qa_pair["question"],261expected_answer=qa_pair["answer"],262actual_answer=result["actual"] or "N/A",263correct_indicator="โ " if result["score"] else "โ",264total_duration=result["total_duration"],265tool_calls=json.dumps(result["tool_calls"], indent=2),266summary=result["summary"] or "N/A",267feedback=result["feedback"] or "N/A",268)269for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))270])271272return report273274275def parse_headers(header_list: list[str]) -> dict[str, str]:276"""Parse header strings in format 'Key: Value' into a dictionary."""277headers = {}278if not header_list:279return headers280281for header in header_list:282if ":" in header:283key, value = header.split(":", 1)284headers[key.strip()] = value.strip()285else:286print(f"Warning: Ignoring malformed header: {header}")287return headers288289290def parse_env_vars(env_list: list[str]) -> dict[str, str]:291"""Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""292env = {}293if not env_list:294return env295296for env_var in env_list:297if "=" in env_var:298key, value = env_var.split("=", 1)299env[key.strip()] = value.strip()300else:301print(f"Warning: Ignoring malformed environment variable: {env_var}")302return env303304305async def main():306parser = argparse.ArgumentParser(307description="Evaluate MCP servers using test questions",308formatter_class=argparse.RawDescriptionHelpFormatter,309epilog="""310Examples:311# Evaluate a local stdio MCP server312python evaluation.py -t stdio -c python -a my_server.py eval.xml313314# Evaluate an SSE MCP server315python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml316317# Evaluate an HTTP MCP server with custom model318python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml319""",320)321322parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")323parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")324parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")325326stdio_group = parser.add_argument_group("stdio options")327stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")328stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")329stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")330331remote_group = parser.add_argument_group("sse/http options")332remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")333remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http 
only)")334335parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")336337args = parser.parse_args()338339if not args.eval_file.exists():340print(f"Error: Evaluation file not found: {args.eval_file}")341sys.exit(1)342343headers = parse_headers(args.headers) if args.headers else None344env_vars = parse_env_vars(args.env) if args.env else None345346try:347connection = create_connection(348transport=args.transport,349command=args.command,350args=args.args,351env=env_vars,352url=args.url,353headers=headers,354)355except ValueError as e:356print(f"Error: {e}")357sys.exit(1)358359print(f"๐ Connecting to MCP server via {args.transport}...")360361async with connection:362print("โ Connected successfully")363report = await run_evaluation(args.eval_file, connection, args.model)364365if args.output:366args.output.write_text(report)367print(f"\nโ Report saved to {args.output}")368else:369print("\n" + report)370371372if __name__ == "__main__":373asyncio.run(main())374