Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/fixtures/source-evaluations/approved-harness-source.json
{
  "evaluation_id": "fixture-approved-harness-source",
  "timestamp": "2026-05-14T00:00:00+00:00",
  "source": {
    "url": "https://github.com/karpathy/autoresearch/blob/master/program.md",
    "title": "Karpathy autoresearch program",
    "author_or_org": "Andrej Karpathy",
    "published_at": "2026-03",
    "source_type": "code",
    "retrieval_status": "retrieved",
    "primary_or_secondary": "primary"
  },
  "gatekeeper": {
    "G1_mechanism_specificity": {
      "pass": true,
      "evidence": "Defines a concrete autonomous experiment loop with editable and locked files."
    },
    "G2_implementable_artifacts": {
      "pass": true,
      "evidence": "Provides program.md instructions, command loop, result logging, and rollback policy."
    },
    "G3_beyond_basics": {
      "pass": true,
      "evidence": "Covers long-running autonomous experimentation, metric integrity, and failure handling."
    },
    "G4_source_verifiability": {
      "pass": true,
      "evidence": "Public repository from a verifiable researcher."
    },
    "verdict": "PASS",
    "rejection_reason": null
  },
  "scoring": {
    "D1_technical_depth_actionability": {
      "reasoning": "The loop can be directly implemented with git, a training command, and result parsing.",
      "score": 2
    },
    "D2_repo_relevance": {
      "reasoning": "Directly informs harness engineering for autonomous research loops.",
      "score": 2
    },
    "D3_evidence_rigor": {
      "reasoning": "The source provides runnable code and a locked metric, but not broad comparative evidence.",
      "score": 1
    },
    "D4_novelty_insight": {
      "reasoning": "The constrained editable-surface framing is a useful operating model for this repo.",
      "score": 2
    },
    "weighted_total": 1.8,
    "calculation_shown": "(2*0.35) + (2*0.30) + (1*0.20) + (2*0.15) = 1.8"
  },
  "decision": {
    "verdict": "HUMAN_REVIEW",
    "override_triggered": "O3",
    "confidence": "high",
    "justification": "High-value source with concrete harness mechanics. Human review is required because evidence rigor is a 1."
  },
  "extraction": {
    "mechanism": "Constrain autonomy with locked evaluation, narrow editable surfaces, durable result logs, and rollback.",
    "implementable_artifacts": [
      "program.md loop",
      "results.tsv",
      "git rollback policy"
    ],
    "failure_modes": [
      "metric gaming",
      "unbounded loop without logging",
      "ambiguous keep/discard decision"
    ],
    "candidate_skill_target": "existing skill",
    "candidate_skill_name": "harness-engineering",
    "taxonomy_category": "harness_engineering",
    "estimated_complexity": "medium"
  },
  "human_review_notes": "Use as a fixture for sources that pass gates but still require human review through O3."
}