Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
339
Skill
n/a
Size
4.3 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
researcher/scripts/loop_step.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
Code · 425 lines · Free
researcher/scripts/loop_step.py
1#!/usr/bin/env python3
2"""Advance the continuous research loop by one step.
3 
4A single invocation either:
5 
61. Pulls the oldest inbox item into a new run (if under budget).
72. Advances the oldest active run by exactly one safe state transition.
83. Parks runs that need human or model judgment.
94. Returns exit code 78 when there is no safe work to do.
10 
11The loop is intentionally conservative. It never invokes LLMs and never makes
12network calls that are not whitelisted. The default `--allow-fetch` is off, so
13the loop scheduler that runs unattended only manages bookkeeping until a human
14or another adapter retrieves the source.
15"""
16 
17from __future__ import annotations
18 
19import argparse
20import json
21import re
22import subprocess
23import sys
24import urllib.error
25import urllib.request
26from datetime import datetime, timezone
27from pathlib import Path
28from typing import Any
29 
30from loop_common import (
31    QUEUE_DIR,
32    REPORTS_DIR,
33    RESEARCHER,
34    ROOT,
35    RUNS_DIR,
36    append_jsonl,
37    categorize_runs,
38    load_config,
39    load_run_state,
40    queue_lock,
41    read_jsonl,
42    runs_created_today,
43    today_utc,
44    utc_now,
45    write_jsonl,
46)
47 
48 
# Append-only JSONL logs written by the loop under the reports directory.
FAILURE_LOG = REPORTS_DIR.joinpath("loop-failures.jsonl")
LOOP_EVENTS = REPORTS_DIR.joinpath("loop-events.jsonl")

# JSONL queue files under the queue directory.
PARKED_FILE = QUEUE_DIR.joinpath("parked.jsonl")
DONE_FILE = QUEUE_DIR.joinpath("done.jsonl")
QUARANTINE_FILE = QUEUE_DIR.joinpath("quarantine.jsonl")
INBOX_FILE = QUEUE_DIR.joinpath("inbox.jsonl")

# Sibling script invoked as a subprocess for the `init` and `retrieve` commands.
RESEARCH_LOOP = RESEARCHER.joinpath("scripts", "research_loop.py")

# Sent as the User-Agent header on every automatic fetch.
USER_AGENT = "context-engineering-researcher/0.1 (+https://github.com/muratcankoylan/Agent-Skills-for-Context-Engineering)"

# Upper bound on bytes stored per fetched source; longer bodies are truncated.
MAX_FETCH_BYTES = 1_500_000
58 
59 
def record_event(event: dict[str, Any]) -> None:
    """Append a timestamped copy of ``event`` to the loop event log.

    The caller's dict is left untouched; the copy gets a ``timestamp`` key
    (replacing any existing value) before being appended to ``LOOP_EVENTS``.
    """
    stamped = {**event, "timestamp": utc_now()}
    append_jsonl(LOOP_EVENTS, stamped)
64 
65 
def record_failure(event: dict[str, Any]) -> None:
    """Append a timestamped copy of ``event`` to the failure log.

    Works on a shallow copy so the caller's dict is not mutated; the copy's
    ``timestamp`` key is set to the current ``utc_now()`` value.
    """
    entry: dict[str, Any] = {}
    entry.update(event)
    entry["timestamp"] = utc_now()
    append_jsonl(FAILURE_LOG, entry)
70 
71 
def failures_today() -> int:
    """Count failure-log records stamped with today's date.

    A record counts when its ``timestamp`` is a string that starts with the
    value returned by ``today_utc()``. Blank lines, lines that are not valid
    JSON, JSON values that are not objects, and records whose timestamp is
    missing or not a string are skipped rather than raising, so one malformed
    log line cannot abort the whole loop step.

    Returns:
        The number of matching records, or 0 when the log does not exist.
    """
    try:
        # Read directly instead of exists()+read: the file may vanish in between.
        lines = FAILURE_LOG.read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        return 0

    today = today_utc()
    count = 0
    for line in lines:
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # json.loads may yield a list, number, string or null; only objects carry a timestamp.
        if not isinstance(record, dict):
            continue
        stamp = record.get("timestamp")
        if isinstance(stamp, str) and stamp.startswith(today):
            count += 1
    return count
87 
88 
def park_run(run_id: str, reason: str) -> None:
    """Add ``run_id`` to the parked queue unless it is already listed.

    The read-check-write sequence runs under the ``"parked"`` queue lock. When
    a record with the same ``run_id`` exists, the file is left unchanged.
    """
    with queue_lock("parked"):
        entries = read_jsonl(PARKED_FILE)
        for entry in entries:
            if entry.get("run_id") == run_id:
                return
        entries.append({"run_id": run_id, "reason": reason, "parked_at": utc_now()})
        write_jsonl(PARKED_FILE, entries)
96 
97 
def unpark_run(run_id: str) -> None:
    """Drop every parked record for ``run_id`` and rewrite the parked file.

    The file is rewritten under the ``"parked"`` queue lock even when no
    record matched.
    """
    with queue_lock("parked"):
        remaining = []
        for entry in read_jsonl(PARKED_FILE):
            if entry.get("run_id") != run_id:
                remaining.append(entry)
        write_jsonl(PARKED_FILE, remaining)
102 
103 
def mark_done(run_id: str, status: str, reason: str) -> None:
    """Record ``run_id`` in the done queue once.

    Runs under the ``"done"`` queue lock. If a record with the same ``run_id``
    already exists nothing is written; otherwise a record with the status,
    reason and a ``closed_at`` timestamp is appended and the file rewritten.
    """
    with queue_lock("done"):
        finished = read_jsonl(DONE_FILE)
        for entry in finished:
            if entry.get("run_id") == run_id:
                return
        closure = {"run_id": run_id, "status": status, "reason": reason, "closed_at": utc_now()}
        finished.append(closure)
        write_jsonl(DONE_FILE, finished)
111 
112 
def quarantine_source(record: dict[str, Any], reason: str) -> None:
    """Append a copy of ``record`` to the quarantine file with the reason attached.

    The copy gains ``quarantined_at`` and ``quarantine_reason`` keys; the
    caller's dict is not modified. The append happens under the
    ``"quarantine"`` queue lock.
    """
    annotated = {
        **record,
        "quarantined_at": utc_now(),
        "quarantine_reason": reason,
    }
    with queue_lock("quarantine"):
        append_jsonl(QUARANTINE_FILE, annotated)
119 
120 
def peek_inbox_item() -> dict[str, Any] | None:
    """Return the inbox record with the smallest ``discovered_at``, if any.

    Records without ``discovered_at`` compare as the empty string. Ties go to
    the record that appears first in the file. The inbox file is not modified.
    """
    candidates = read_jsonl(INBOX_FILE)
    if not candidates:
        return None
    # min() keeps the first of equal keys, matching a stable ascending sort.
    return min(candidates, key=lambda item: item.get("discovered_at", ""))
127 
128 
def remove_inbox_item(source_id: str) -> None:
    """Rewrite the inbox without any record whose ``source_id`` equals the argument.

    No lock is taken here; the file is rewritten even when nothing matched.
    """
    kept = []
    for item in read_jsonl(INBOX_FILE):
        if item.get("source_id") != source_id:
            kept.append(item)
    write_jsonl(INBOX_FILE, kept)
133 
134 
def reap_closed_runs() -> list[dict[str, Any]]:
    """Move closed runs that are not yet in the done queue into it.

    Walks the run directories in sorted order. For each directory whose run
    state has ``current_state == "closed"`` and whose name is not already in
    the done file, the run is unparked and marked done using the state's
    ``close_status`` (default ``"unknown"``) and ``close_reason`` (default
    ``""``).

    Returns:
        One ``{"action": "reaped", "run_id": ...}`` event per reaped run; an
        empty list when the runs directory does not exist.
    """
    if not RUNS_DIR.exists():
        return []

    # Snapshot taken once, before the walk.
    already_done = {entry.get("run_id") for entry in read_jsonl(DONE_FILE)}
    reaped: list[dict[str, Any]] = []
    for candidate in sorted(RUNS_DIR.iterdir()):
        if not candidate.is_dir():
            continue
        state = load_run_state(candidate)
        is_closed = bool(state) and state.get("current_state") == "closed"
        if not is_closed:
            continue
        run_id = candidate.name
        if run_id in already_done:
            continue
        unpark_run(run_id)
        close_status = state.get("close_status") or "unknown"
        close_reason = state.get("close_reason") or ""
        mark_done(run_id, status=close_status, reason=close_reason)
        reaped.append({"action": "reaped", "run_id": run_id})
    return reaped
156 
157 
def init_run(source: dict[str, Any]) -> dict[str, Any]:
    """Create a run for ``source`` by invoking ``research_loop.py init``.

    Each inbox field is coerced to a string before being placed on the command
    line: a missing key or an explicit JSON ``null`` becomes ``""`` and any
    other non-string value is passed through ``str()``. ``subprocess.run``
    rejects ``None`` and other non-string arguments with ``TypeError``, which
    would otherwise abort the step before the source could be quarantined.

    Args:
        source: Inbox record; reads ``title``, ``url``, ``author_or_org``,
            ``source_type`` and ``candidate_reason``.

    Returns:
        ``{"ok": True, "run_relative": <last stdout line or "">}`` when the
        subprocess exits 0, otherwise ``{"ok": False, "stderr": ...,
        "stdout": ...}`` with both streams stripped.
    """

    def field(key: str) -> str:
        # dict.get(key, "") still returns None when the key exists with a null value.
        value = source.get(key)
        return "" if value is None else str(value)

    # Title falls back to the URL, then to a fixed placeholder.
    title = str(source.get("title") or source.get("url") or "untitled")
    cmd = [
        sys.executable,
        str(RESEARCH_LOOP),
        "init",
        "--title",
        title,
        "--url",
        field("url"),
        "--author-or-org",
        field("author_or_org"),
        "--source-type",
        normalize_source_type(source.get("source_type")),
        "--reason",
        field("candidate_reason"),
    ]
    completed = subprocess.run(cmd, cwd=ROOT, capture_output=True, text=True, check=False)
    if completed.returncode != 0:
        return {"ok": False, "stderr": completed.stderr.strip(), "stdout": completed.stdout.strip()}
    # The last non-empty stdout line is taken as the run's relative path.
    output = completed.stdout.strip()
    run_relative = output.splitlines()[-1] if output else ""
    return {"ok": True, "run_relative": run_relative}
179 
180 
def normalize_source_type(value: str | None) -> str:
    """Return ``value`` when it is a recognised source type, else ``"other"``.

    Anything that is not a string (including ``None``) maps to ``"other"``;
    matching is exact and case-sensitive.
    """
    recognised = (
        "paper",
        "engineering_blog",
        "documentation",
        "benchmark",
        "code",
        "talk",
        "other",
    )
    if not isinstance(value, str):
        return "other"
    return value if value in recognised else "other"
186 
187 
def fetch_url(url: str, dest_dir: Path) -> dict[str, Any]:
    """Download ``url`` into ``dest_dir`` and report the outcome as a dict.

    Only ``http://`` and ``https://`` URLs are accepted, both for the request
    and for the final URL after redirects. At most ``MAX_FETCH_BYTES`` bytes
    are stored; the file name is derived from the URL and always ends in
    ``.html`` regardless of the reported content type.

    Every network, protocol and filesystem failure is turned into an error
    result instead of propagating, so the caller can park the run and log the
    failure rather than crash the loop step.

    Args:
        url: Source URL to retrieve.
        dest_dir: Directory that receives the downloaded file (created if
            missing).

    Returns:
        On success ``{"ok": True, "path", "bytes", "content_type",
        "truncated", "final_url"}`` where ``path`` is relative to ``ROOT``.
        On failure ``{"ok": False, "error": <message>, "url": <url>}``.
    """
    # Local import: http.client is needed only for the exception type below.
    import http.client

    if not url.lower().startswith(("http://", "https://")):
        return {"ok": False, "error": f"unsupported url scheme: {url[:32]}", "url": url}
    # Collapse every run of characters outside [A-Za-z0-9._-] into one dash.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", url).strip("-")[:120] or "source"
    target = dest_dir / f"{safe_name}.html"
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html,*/*"})
    try:
        dest_dir.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(request, timeout=30) as response:
            final_url = getattr(response, "url", url)
            if not final_url.lower().startswith(("http://", "https://")):
                return {"ok": False, "error": "redirect changed scheme", "url": final_url}
            content_type = response.headers.get("Content-Type", "")
            # Read one byte past the cap so truncation can be detected.
            data = response.read(MAX_FETCH_BYTES + 1)
            truncated = len(data) > MAX_FETCH_BYTES
            data = data[:MAX_FETCH_BYTES]
            target.write_bytes(data)
            return {
                "ok": True,
                "path": str(target.relative_to(ROOT)),
                "bytes": len(data),
                "content_type": content_type,
                "truncated": truncated,
                "final_url": final_url,
            }
    except urllib.error.HTTPError as exc:
        # HTTPError subclasses URLError, so it must be handled first.
        return {"ok": False, "error": f"http {exc.code}", "url": url}
    except urllib.error.URLError as exc:
        return {"ok": False, "error": f"network error: {exc.reason}", "url": url}
    except TimeoutError:
        return {"ok": False, "error": "timeout", "url": url}
    except http.client.HTTPException as exc:
        # e.g. InvalidURL (bad port) or IncompleteRead; urllib does not wrap these.
        return {"ok": False, "error": f"http protocol error: {exc}", "url": url}
    except OSError as exc:
        # Socket errors raised while reading the body, or a failed local write.
        return {"ok": False, "error": f"io error: {exc}", "url": url}
219 
220 
def attempt_retrieval(run_dir: Path, url: str) -> dict[str, Any]:
    """Fetch ``url`` into the run's ``sources/evidence/raw`` directory.

    Returns whatever ``fetch_url`` returns for that destination.
    """
    destination = run_dir.joinpath("sources", "evidence", "raw")
    return fetch_url(url, destination)
224 
225 
def append_evidence_pointer(run_dir: Path, raw_record: dict[str, Any]) -> None:
    """Append a one-line summary of a fetched file to the run's ``retrieval.md``.

    The line lists the record's ``path``, ``bytes`` and ``content_type``
    (defaulting to ``""``, ``0`` and ``""``) followed by the current
    timestamp. The parent directory is created when missing.
    """
    evidence_dir = run_dir / "sources" / "evidence"
    evidence_dir.mkdir(parents=True, exist_ok=True)

    stored_path = raw_record.get("path", "")
    size = raw_record.get("bytes", 0)
    content_type = raw_record.get("content_type", "")
    entry = f"- {stored_path} ({size} bytes, {content_type}) retrieved at {utc_now()}\n"

    with (evidence_dir / "retrieval.md").open("a", encoding="utf-8") as summary:
        summary.write(entry)
235 
236 
def update_run_queue_retrieval(run_dir: Path, status: str, raw_paths: list[str], notes: str) -> None:
    """Stamp retrieval details onto the first record of the run's source queue.

    Reads ``sources/queue.jsonl`` under ``run_dir``, updates the first record
    with the retrieval status, timestamp, raw evidence paths and notes, and
    rewrites the file with keys sorted. Does nothing when the file is missing
    or holds no non-blank lines.
    """
    queue_path = run_dir.joinpath("sources", "queue.jsonl")
    if not queue_path.exists():
        return

    entries = []
    for raw_line in queue_path.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            entries.append(json.loads(raw_line))
    if not entries:
        return

    # Only the first record is updated; later records are written back as read.
    entries[0].update(
        retrieval_status=status,
        retrieved_at=utc_now(),
        raw_evidence=raw_paths,
        retrieval_notes=notes,
    )
    serialized = [json.dumps(entry, sort_keys=True) for entry in entries]
    queue_path.write_text("\n".join(serialized) + "\n", encoding="utf-8")
253 
254 
def advance_initialized(run_dir: Path, state: dict[str, Any], allow_fetch: bool) -> dict[str, Any]:
    """Try to move an ``initialized`` run to ``retrieved``.

    The run is parked when it has no ``source_url``, when fetching is not
    allowed, when the fetch fails, or when the ``research_loop.py retrieve``
    subprocess exits non-zero. The last two cases are also written to the
    failure log. On success the evidence pointer and the run's source queue
    are updated before the state transition is recorded.

    Returns:
        An event dict: ``{"action": "parked", "run_id", "reason"}`` or
        ``{"action": "advanced", "run_id", "to_state": "retrieved", "bytes"}``.
    """
    run_id = run_dir.name
    url = state.get("source_url")

    def parked(reason: Any) -> dict[str, Any]:
        return {"action": "parked", "run_id": run_id, "reason": reason}

    if not url:
        park_run(run_id, "no source URL on run-state.json")
        return parked("no source URL")
    if not allow_fetch:
        park_run(run_id, "automatic retrieval disabled; needs manual retrieve")
        return parked("fetch disabled")

    fetched = attempt_retrieval(run_dir, url)
    if not fetched.get("ok"):
        error = fetched.get("error")
        park_run(run_id, f"retrieval failed: {error}")
        record_failure({"phase": "retrieval", "run_id": run_id, "url": url, "error": error})
        return parked(error)

    # Record where the evidence landed before asking research_loop to transition.
    append_evidence_pointer(run_dir, fetched)
    content_type = fetched.get("content_type", "")
    update_run_queue_retrieval(
        run_dir,
        status="retrieved",
        raw_paths=[fetched["path"]],
        notes=f"auto fetch via loop_step; content_type={content_type}",
    )

    retrieve_cmd = [
        sys.executable,
        str(RESEARCH_LOOP),
        "retrieve",
        "--run-dir",
        str(run_dir),
        "--notes",
        "auto fetch via loop_step",
    ]
    completed = subprocess.run(retrieve_cmd, cwd=ROOT, capture_output=True, text=True, check=False)
    if completed.returncode != 0:
        record_failure({"phase": "retrieve-state", "run_id": run_id, "stderr": completed.stderr.strip()})
        park_run(run_id, "could not record retrieved state")
        return parked("state transition failed")

    return {
        "action": "advanced",
        "run_id": run_id,
        "to_state": "retrieved",
        "bytes": fetched.get("bytes"),
    }
298 
299 
def advance_retrieved(run_dir: Path) -> dict[str, Any]:
    """Park a ``retrieved`` run; this loop never performs the evaluation itself."""
    park_run(run_dir.name, "needs source evaluation by human or judge agent")
    return {
        "action": "parked",
        "run_id": run_dir.name,
        "reason": "needs evaluation",
    }
304 
305 
def advance_run(run_dir: Path, allow_fetch: bool) -> dict[str, Any]:
    """Apply one transition to the run in ``run_dir`` based on its current state.

    ``initialized`` and ``retrieved`` are delegated to their handlers. A
    ``closed`` run is unparked and marked done. Every other case — missing
    state, states that need human or model action, ``pr_ready``, and unknown
    states — parks the run.

    Returns:
        The event dict describing what happened (``action`` is ``"parked"``,
        ``"closed"``, or whatever the delegated handler returns).
    """
    run_id = run_dir.name

    def park(queue_reason: str, event_reason: str) -> dict[str, Any]:
        park_run(run_id, queue_reason)
        return {"action": "parked", "run_id": run_id, "reason": event_reason}

    state = load_run_state(run_dir)
    if not state:
        return park("missing run-state.json", "missing state")

    current = state.get("current_state")
    if current == "initialized":
        return advance_initialized(run_dir, state, allow_fetch)
    if current == "retrieved":
        return advance_retrieved(run_dir)
    if current in {"evaluated", "proposed", "novelty_checked", "validated"}:
        return park(f"needs human or model action from state {current}", f"needs action from {current}")
    if current == "pr_ready":
        return park("PR is ready for human merge approval", "needs merge approval")
    if current == "closed":
        unpark_run(run_id)
        close_status = state.get("close_status") or "unknown"
        close_reason = state.get("close_reason") or ""
        mark_done(run_id, status=close_status, reason=close_reason)
        return {"action": "closed", "run_id": run_id}
    return park(f"unknown current state {current}", f"unknown state {current}")
332 
333 
def _run_updated_at(run_dir: Path) -> str:
    """Sort key for active runs: the state's ``updated_at``, or ``""`` when absent."""
    # Load once per run: a second read costs extra I/O and may see different content.
    state = load_run_state(run_dir) or {}
    return state.get("updated_at") or ""


def _remove_claimed_source(source: dict[str, Any]) -> None:
    """Remove the inbox record that was just handed to ``init_run``."""
    source_id = source.get("source_id")
    if source_id is not None:
        remove_inbox_item(source_id)
        return
    # Without an id, remove_inbox_item(None) would drop every id-less record;
    # remove only the exact record that was processed.
    remaining = read_jsonl(INBOX_FILE)
    if source in remaining:
        remaining.remove(source)
        write_jsonl(INBOX_FILE, remaining)


def loop_step(config: dict[str, Any], allow_fetch: bool) -> dict[str, Any]:
    """Perform at most one unit of work for the research loop.

    Order of operations:

    1. Reap closed runs into the done queue and log one event per reaped run.
    2. Stop when today's failure count or the parked queue has hit its budget.
    3. If the active-run and per-day budgets allow, claim the oldest inbox
       item and initialise a run for it (quarantining the item on failure).
    4. Otherwise advance the active run with the oldest ``updated_at``.

    Args:
        config: Loaded configuration; ``config["budgets"]`` may override
            ``max_active_runs`` (3), ``max_runs_per_day`` (6),
            ``max_failures_per_day`` (5) and ``max_parked`` (12).
        allow_fetch: Whether ``initialized`` runs may be fetched over HTTP.

    Returns:
        ``{"ok": True, "action": ..., ...}`` describing the step taken.
    """
    budgets = config.get("budgets", {})
    max_active = budgets.get("max_active_runs", 3)
    max_runs_today = budgets.get("max_runs_per_day", 6)
    max_failures = budgets.get("max_failures_per_day", 5)
    max_parked = budgets.get("max_parked", 12)
    # `mode` governs future LLM-judge feeds, not stdlib HTTP retrieval. Retrieval is
    # controlled by --allow-fetch alone so the daemon can stage evidence without
    # incurring paid-API spend.

    for event in reap_closed_runs():
        record_event({"phase": "reap", **event})

    if failures_today() >= max_failures:
        record_event({"action": "stop", "reason": "failure budget exceeded"})
        return {"ok": True, "action": "stop", "reason": "failure budget exceeded"}

    buckets = categorize_runs()
    active = buckets["active"]
    parked = read_jsonl(PARKED_FILE)
    if len(parked) >= max_parked:
        record_event({"action": "stop", "reason": "parked queue full"})
        return {"ok": True, "action": "stop", "reason": "parked queue full"}

    if len(active) < max_active and runs_created_today() < max_runs_today:
        with queue_lock("inbox"):
            # Re-check budgets inside the lock so concurrent loop_step invocations
            # cannot both pass the pre-check and exceed max_active/max_per_day.
            current_buckets = categorize_runs()
            under_budget = (
                len(current_buckets["active"]) < max_active
                and runs_created_today() < max_runs_today
            )
            source = peek_inbox_item() if under_budget else None
            if source:
                init = init_run(source)
                # The item leaves the inbox whether init succeeded or failed.
                _remove_claimed_source(source)
                if not init.get("ok"):
                    quarantine_source(
                        source,
                        reason=f"init failed: {init.get('stderr', '')[:200]}",
                    )
                    record_failure(
                        {
                            "phase": "init",
                            "source": source.get("url"),
                            "stderr": init.get("stderr"),
                        }
                    )
                    record_event({"action": "init-failed", "source": source.get("url")})
                    return {"ok": True, "action": "init-failed", "source": source.get("url")}
                event = {
                    "action": "initialized",
                    "source": source.get("url"),
                    "run": init.get("run_relative"),
                }
                record_event(event)
                return {"ok": True, **event}

    if not active:
        record_event({"action": "no-op", "reason": "no active runs and inbox empty or budget reached"})
        return {"ok": True, "action": "no-op"}

    # min() returns the first run with the smallest key, as a stable sort would.
    target = min(active, key=_run_updated_at)
    result = advance_run(target, allow_fetch)
    record_event({"phase": "advance", **result})
    return {"ok": True, **result}
404 
405 
def main() -> int:
    """Parse CLI flags, run one loop step, print the outcome, and return the exit code.

    Prints the result as indented JSON with ``--json``, otherwise as a single
    ``loop_step: <action> <reason>`` line. Returns 78 when the step's action
    is ``"no-op"`` and 0 for every other action.
    """
    parser = argparse.ArgumentParser(description="Advance the continuous research loop by one step")
    parser.add_argument("--allow-fetch", action="store_true", help="permit HTTP GET retrieval of source URLs")
    parser.add_argument("--json", action="store_true")
    options = parser.parse_args()

    settings = load_config()
    outcome = loop_step(settings, options.allow_fetch)
    action = outcome.get("action")

    if options.json:
        rendered = json.dumps(outcome, indent=2)
    else:
        # strip() removes the trailing space left when there is no reason.
        rendered = f"loop_step: {action} {outcome.get('reason') or ''}".strip()
    print(rendered)

    return 78 if action == "no-op" else 0


if __name__ == "__main__":
    raise SystemExit(main())
425
Preparing the source view

Agent Skills for Context Engineering

researcher/scripts/loop_step.py