Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/scripts/novelty_check.py
#!/usr/bin/env python3
"""Check whether a proposed skill idea overlaps existing corpus content.

The proposal (from ``--text``, ``--file`` or stdin) is reduced to a set of
tokens and compared, using Jaccard similarity, against:

* entries in ``researcher/mechanisms/registry.jsonl`` and
* markdown documents under ``skills/``, ``researcher/fixtures/`` and
  ``researcher/runs/``.

The process exit code is 0 when the verdict is ``pass`` and 2 otherwise.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from pathlib import Path
from typing import Any


_SCRIPT_PATH = Path(__file__).resolve()
# Default corpus root: two levels above this script's directory.  Fall back
# to the script's own directory when the file sits in a shallower location,
# so importing the module never raises IndexError.
ROOT = _SCRIPT_PATH.parents[2] if len(_SCRIPT_PATH.parents) > 2 else _SCRIPT_PATH.parent

# Words that appear in almost every proposal/skill document; they are dropped
# before scoring so they do not inflate the overlap.
STOPWORDS = {
    "the",
    "and",
    "for",
    "with",
    "that",
    "this",
    "from",
    "into",
    "when",
    "agent",
    "agents",
    "skill",
    "skills",
    "context",
    "activation",
    "artifact",
    "artifacts",
    "author",
    "candidate",
    "change",
    "changes",
    "check",
    "checks",
    "claim",
    "claims",
    "decision",
    "delta",
    "evidence",
    "existing",
    "file",
    "gaps",
    "human",
    "proposal",
    "proposed",
    "quality",
    "retrieval",
    "review",
    "risk",
    "risks",
    "source",
    "status",
    "target",
    "type",
}

# ``##`` sections of a structured proposal that are used for scoring, in order.
STRUCTURED_SECTIONS = [
    "Mechanism",
    "Skill Target",
    "Proposed Delta",
    "Risks And Gaps",
    "Recommendation",
]

# A token starts with a letter and is at least three characters long.  The
# text is lower-cased before matching, so only lower-case ranges are needed.
_TOKEN_RE = re.compile(r"[a-z][a-z0-9_-]{2,}")

# ``## Heading`` line followed by a (possibly empty) body that runs until the
# next ``##`` heading or the end of the text.
_SECTION_RE = re.compile(
    r"^##[ \t]+([^\n]+)\n(.*?)(?=^##\s|\Z)",
    flags=re.DOTALL | re.MULTILINE,
)


def tokens(text: str) -> set[str]:
    """Return the set of normalised, non-stopword tokens found in *text*.

    Matching is case-insensitive and underscores are normalised to hyphens,
    so ``foo_bar`` and ``foo-bar`` compare equal.
    """
    raw = _TOKEN_RE.findall(text.lower())
    return {t.replace("_", "-") for t in raw if t not in STOPWORDS}


def jaccard(a: set[str], b: set[str]) -> float:
    """Return the Jaccard similarity of *a* and *b* (0.0 if either is empty)."""
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)


def read_input(args: argparse.Namespace) -> str:
    """Collect the proposal text from ``args.text`` and/or ``args.file``.

    When neither is supplied the proposal is read from stdin.  Multiple
    sources are joined with newlines and the result is stripped.
    """
    parts: list[str] = []
    if args.text:
        parts.append(args.text)
    if args.file:
        parts.append(Path(args.file).read_text(encoding="utf-8"))
    if not parts:
        parts.append(sys.stdin.read())
    return "\n".join(parts).strip()


def extract_sections(text: str) -> dict[str, str]:
    """Map each ``## Heading`` in *text* to its stripped body.

    A heading with no body maps to ``""``; it does not absorb the heading and
    body of the section that follows it.
    """
    return {
        match.group(1).strip(): match.group(2).strip()
        for match in _SECTION_RE.finditer(text)
    }


def structured_proposal_text(text: str) -> str:
    """Return the text of the structured proposal sections present in *text*.

    Falls back to :func:`salient_text` when none of ``STRUCTURED_SECTIONS``
    is present with a non-empty body.
    """
    sections = extract_sections(text)
    values = [sections[name] for name in STRUCTURED_SECTIONS if sections.get(name)]
    return "\n".join(values) if values else salient_text(text)


def mechanism_text(entry: dict[str, Any]) -> str:
    """Flatten the descriptive fields of a registry *entry* into one string.

    String-valued scalar fields and list-valued fields are included; values
    of any other type are ignored.
    """
    values: list[str] = []
    for key in ["mechanism_id", "owning_skill", "activation_scenario", "behavior_change"]:
        value = entry.get(key)
        if isinstance(value, str):
            values.append(value)
    for key in ["failure_modes", "evidence"]:
        value = entry.get(key)
        if isinstance(value, list):
            values.extend(str(item) for item in value)
    return "\n".join(values)


def _invalid_entry(line_number: int, line: str) -> dict[str, Any]:
    """Build the placeholder entry used for an unusable registry line."""
    return {
        "mechanism_id": f"invalid-json-line-{line_number}",
        "status": "invalid",
        "activation_scenario": "",
        "behavior_change": line,
        "failure_modes": ["invalid registry entry"],
    }


def load_mechanisms(root: Path) -> list[dict[str, Any]]:
    """Load scoreable entries from ``researcher/mechanisms/registry.jsonl``.

    Returns an empty list when the registry file does not exist.  Blank lines
    are skipped.  Entries whose ``status`` is ``accepted`` or ``candidate``
    are returned as-is.  Lines that are not valid JSON, or that decode to
    something other than a JSON object, are returned as placeholder entries
    carrying the raw line so they still take part in overlap scoring.
    """
    path = root / "researcher" / "mechanisms" / "registry.jsonl"
    if not path.exists():
        return []
    entries: list[dict[str, Any]] = []
    lines = path.read_text(encoding="utf-8").splitlines()
    for line_number, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            entries.append(_invalid_entry(line_number, line))
            continue
        if not isinstance(entry, dict):
            # Valid JSON but not an object (list, string, number...): there
            # are no fields to read, so treat it like a malformed line
            # instead of crashing on ``.get``.
            entries.append(_invalid_entry(line_number, line))
            continue
        if entry.get("status") in {"accepted", "candidate"}:
            entries.append(entry)
    return entries


def salient_text(text: str) -> str:
    """Keep the parts of a proposal most likely to contain real mechanism text.

    If any of the known sections has content, only those sections are kept;
    otherwise the whole text is used.  Blank lines, ``- [`` list items,
    markdown table separator rows and unfilled template lines are dropped.
    """
    section_names = [
        "Mechanism",
        "Proposed Delta",
        "Risks And Gaps",
        "Recommendation",
        "Core Concepts",
        "Detailed Topics",
        "Practical Guidance",
        "Guidelines",
        "Gotchas",
    ]
    chunks: list[str] = []
    for section in section_names:
        # ``.*?`` plus a lookahead lets an empty section match as empty rather
        # than swallowing the heading and body of whatever section follows.
        match = re.search(
            rf"^## {re.escape(section)}\n(.*?)(?=^## |\Z)",
            text,
            flags=re.DOTALL | re.MULTILINE,
        )
        if match and match.group(1).strip():
            chunks.append(match.group(1))
    reduced = "\n".join(chunks) if chunks else text
    lines: list[str] = []
    for line in reduced.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("- ["):
            continue
        if stripped in {"| --- | --- | --- |", "| --- | --- |"}:
            continue
        if "[Short Title]" in stripped or '""' in stripped:
            continue
        lines.append(stripped)
    return "\n".join(lines)


def corpus_documents(root: Path, exclude: Path | None = None) -> list[dict[str, str]]:
    """Collect the corpus documents to compare a proposal against.

    Scans ``skills/*/SKILL.md``, ``researcher/fixtures/**/*.md`` and
    ``researcher/runs/**/*.md`` under *root*, in that order, each sorted by
    path.  The file at *exclude* (if given) is skipped.  Under ``runs``, a
    ``skill-proposal.md`` that contains ``Retrieval status: partial`` or the
    ``[Short Title]`` template marker is skipped as well.

    Returns a list of ``{"path": <path relative to root>, "text": <salient
    text>}`` dictionaries.
    """
    excluded = exclude.resolve() if exclude else None
    # (base directory, glob pattern, whether the skill-proposal filter applies)
    sources = [
        (root / "skills", "*/SKILL.md", False),
        (root / "researcher" / "fixtures", "**/*.md", False),
        (root / "researcher" / "runs", "**/*.md", True),
    ]
    docs: list[dict[str, str]] = []
    for base, pattern, filter_proposals in sources:
        for path in sorted(base.glob(pattern)):
            if excluded is not None and path.resolve() == excluded:
                continue
            text = path.read_text(encoding="utf-8")
            if filter_proposals and path.name == "skill-proposal.md":
                if "Retrieval status: partial" in text or "[Short Title]" in text:
                    continue
            docs.append({"path": str(path.relative_to(root)), "text": salient_text(text)})
    return docs


def main(argv: list[str] | None = None) -> int:
    """Run the novelty check and print the result.

    Args:
        argv: Command-line arguments; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        0 when the verdict is ``pass``; 2 for ``human_review`` or
        ``likely_duplicate``.
    """
    parser = argparse.ArgumentParser(description="Check novelty of a skill proposal against corpus")
    parser.add_argument("--root", type=Path, default=ROOT)
    parser.add_argument("--text", help="proposal text to check")
    parser.add_argument("--file", type=Path, help="proposal file to check")
    parser.add_argument("--threshold", type=float, default=0.18)
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args(argv)

    proposal = read_input(args)
    proposal_structured_text = structured_proposal_text(proposal)
    proposal_tokens = tokens(proposal_structured_text)

    mechanism_overlaps: list[dict[str, Any]] = []
    for entry in load_mechanisms(args.root):
        entry_tokens = tokens(mechanism_text(entry))
        score = jaccard(proposal_tokens, entry_tokens)
        if score > 0:
            mechanism_overlaps.append(
                {
                    "mechanism_id": entry.get("mechanism_id", ""),
                    "owning_skill": entry.get("owning_skill", ""),
                    "score": round(score, 4),
                    "shared_terms": sorted(proposal_tokens & entry_tokens)[:30],
                }
            )
    mechanism_overlaps.sort(key=lambda item: item["score"], reverse=True)

    overlaps: list[dict[str, Any]] = []
    # Never compare the proposal file against itself.
    exclude = args.file.resolve() if args.file else None
    for doc in corpus_documents(args.root, exclude=exclude):
        doc_tokens = tokens(doc["text"])
        score = jaccard(proposal_tokens, doc_tokens)
        if score > 0:
            shared = sorted(proposal_tokens & doc_tokens)[:30]
            overlaps.append({"path": doc["path"], "score": round(score, 4), "shared_terms": shared})
    overlaps.sort(key=lambda item: item["score"], reverse=True)

    max_mechanism_score = mechanism_overlaps[0]["score"] if mechanism_overlaps else 0.0
    max_corpus_score = overlaps[0]["score"] if overlaps else 0.0
    max_score = max(max_mechanism_score, max_corpus_score)

    # Escalate from "pass" to "human_review" at the threshold and to
    # "likely_duplicate" at 1.75x the threshold.
    verdict = "pass"
    if max_score >= args.threshold:
        verdict = "human_review"
    if max_score >= args.threshold * 1.75:
        verdict = "likely_duplicate"

    result = {
        "verdict": verdict,
        "threshold": args.threshold,
        "max_score": max_score,
        "max_mechanism_score": max_mechanism_score,
        "top_mechanism_overlaps": mechanism_overlaps[:10],
        "max_corpus_score": max_corpus_score,
        "top_overlaps": overlaps[:10],
    }
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(f"Novelty verdict: {verdict} (max overlap {max_score})")
        if mechanism_overlaps:
            print("Top mechanism overlaps:")
            for item in mechanism_overlaps[:5]:
                print(
                    f"- {item['mechanism_id']} ({item['owning_skill']}): "
                    f"{item['score']} shared={', '.join(item['shared_terms'][:8])}"
                )
        if overlaps:
            print("Top corpus overlaps:")
            for item in overlaps[:5]:
                print(f"- {item['path']}: {item['score']} shared={', '.join(item['shared_terms'][:8])}")

    return 0 if verdict == "pass" else 2


if __name__ == "__main__":
    raise SystemExit(main())