Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
339
Skill
n/a
Size
4.3 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
researcher/scripts/loop_discover.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 166 lines · Free
researcher/scripts/loop_discover.py
1#!/usr/bin/env python3
2"""Discover candidate sources and append them to the inbox.
3 
4By default the discoverer only reads `researcher/discovery/manual-seed.jsonl`.
5Paid feeds (Parallel deep research, web search) are off by default and require
6explicit config opt-in plus separate adapter scripts.
7"""
8 
9from __future__ import annotations
10 
11import argparse
12import json
13import sys
14from pathlib import Path
15from typing import Any
16 
17from loop_common import (
18    QUEUE_DIR,
19    RESEARCHER,
20    active_run_urls,
21    append_jsonl,
22    closed_run_urls,
23    load_config,
24    queue_lock,
25    read_jsonl,
26    source_id_for,
27    utc_now,
28    write_jsonl,
29)
30 
31 
def load_manual_seed(path: Path) -> list[dict[str, Any]]:
    """Load candidate records from a JSON Lines manual-seed file.

    Args:
        path: Location of the seed file. A missing file is treated as an
            empty feed rather than an error.

    Returns:
        The decoded JSON objects in file order. Blank lines are skipped.

    Raises:
        ValueError: If a non-blank line is not valid JSON, or decodes to
            something other than a JSON object. The message carries the
            1-based line number.
    """
    if not path.exists():
        return []
    records: list[dict[str, Any]] = []
    # Iterate the file object rather than ``str.splitlines()``: splitlines also
    # breaks on U+2028, U+2029 and U+0085, which may legally appear unescaped
    # inside JSON strings and would cut a valid record in half.
    with path.open(encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"manual-seed line {line_number} invalid JSON: {exc}") from exc
            # Callers use ``record.get(...)``; reject scalars and arrays here
            # with a line number instead of failing later with AttributeError.
            if not isinstance(record, dict):
                raise ValueError(
                    f"manual-seed line {line_number} is not a JSON object: {type(record).__name__}"
                )
            records.append(record)
    return records
44 
45 
def _normalize_url(url: str) -> str:
    """Return the dedupe key for *url*: surrounding whitespace removed, lower-cased."""
    trimmed = url.strip()
    return trimmed.lower()
48 
49 
def existing_urls(inbox: list[dict[str, Any]], quarantine: list[dict[str, Any]]) -> set[str]:
    """Collect the normalized URLs that are already known.

    Gathers the string ``url`` values of the inbox and quarantine records,
    then adds every URL reported by ``active_run_urls()`` and
    ``closed_run_urls()``. All values pass through ``_normalize_url``.
    Records whose ``url`` is missing or not a string are ignored.
    """
    known: set[str] = set()
    for bucket in (inbox, quarantine):
        for entry in bucket:
            entry_url = entry.get("url")
            if not isinstance(entry_url, str):
                continue
            known.add(_normalize_url(entry_url))
    known |= {_normalize_url(run_url) for run_url in active_run_urls()}
    known |= {_normalize_url(run_url) for run_url in closed_run_urls()}
    return known
59 
60 
def normalize_candidate(record: dict[str, Any], feed: str) -> dict[str, Any]:
    """Turn a raw feed record into an inbox entry.

    Args:
        record: Decoded feed record. Only ``url`` is required; ``title``,
            ``author_or_org``, ``source_type`` and ``candidate_reason`` are
            optional and are stringified and stripped.
        feed: Name of the feed the record came from, stored verbatim.

    Returns:
        A new dict with the original (stripped) URL, its normalized form, the
        ``source_id`` derived from the normalized URL, the optional text
        fields, the feed name, a ``discovered_at`` timestamp from
        ``utc_now()``, and the initial ``attempts`` / ``last_status`` values.

    Raises:
        ValueError: If ``url`` is absent, ``None`` or blank.
    """

    def text_field(key: str, default: str = "") -> str:
        # JSON ``null`` decodes to None, and ``str(None)`` would leak the
        # literal text "None" into the queue, so treat it like an absent key.
        value = record.get(key)
        if value is None:
            return default
        return str(value).strip()

    raw_url = text_field("url")
    if not raw_url:
        raise ValueError("candidate is missing url")
    normalized = _normalize_url(raw_url)
    return {
        "source_id": source_id_for(normalized),
        "url": raw_url,
        "url_normalized": normalized,
        "title": text_field("title"),
        "author_or_org": text_field("author_or_org"),
        "source_type": text_field("source_type", "other"),
        "candidate_reason": text_field("candidate_reason"),
        "feed": feed,
        "discovered_at": utc_now(),
        "attempts": 0,
        "last_status": "queued",
    }
79 
80 
def discover(config: dict[str, Any], dry_run: bool, limit: int | None) -> dict[str, Any]:
    """Collect unseen candidates from the enabled feeds and queue them in the inbox.

    Args:
        config: Loaded configuration. Reads ``feeds.manual_seed`` (seed path
            relative to ``RESEARCHER.parent``), the two ``feeds.enable_*``
            flags, ``limits.discovery_max_new_per_run`` (default 8) and
            ``budgets.max_inbox_size`` (default 200).
        dry_run: When true, compute the result without writing the inbox.
        limit: Per-run cap on new candidates; overrides the configured
            ``discovery_max_new_per_run`` when not ``None``. Zero or a
            negative value yields no new candidates.

    Returns:
        A summary dict with ``ok``, ``dry_run``, ``new`` (number of accepted
        candidates), ``inbox_size`` (size after the additions, also projected
        for dry runs), ``capacity_remaining`` and ``first_new`` (up to three
        of the accepted candidates).

    Raises:
        ValueError: Propagated from ``load_manual_seed`` when the seed file
            contains an invalid line.
    """
    feeds = config.get("feeds", {})
    inbox_path = QUEUE_DIR / "inbox.jsonl"
    quarantine_path = QUEUE_DIR / "quarantine.jsonl"
    discovery_max = limit if limit is not None else config.get("limits", {}).get("discovery_max_new_per_run", 8)
    max_inbox_size = config.get("budgets", {}).get("max_inbox_size", 200)

    # The whole read / dedupe / write cycle runs under the "inbox" queue lock.
    with queue_lock("inbox"):
        inbox = read_jsonl(inbox_path)
        quarantine = read_jsonl(quarantine_path)
        seen = existing_urls(inbox, quarantine)
        new_records: list[dict[str, Any]] = []

        manual_seed_rel = feeds.get("manual_seed")
        if manual_seed_rel:
            manual_path = RESEARCHER.parent / manual_seed_rel
            for record in load_manual_seed(manual_path):
                # Test the quota before accepting a record, so a limit of
                # zero (or less) queues nothing instead of one record.
                if len(new_records) >= discovery_max:
                    break
                try:
                    candidate = normalize_candidate(record, feed="manual-seed")
                except ValueError as exc:
                    print(f"skipping manual-seed candidate: {exc}", file=sys.stderr)
                    continue
                key = candidate.get("url_normalized") or candidate["url"]
                if key in seen:
                    continue
                new_records.append(candidate)
                # Remember the key so duplicates inside the seed file are dropped too.
                seen.add(key)

        # The paid feeds have no adapter in this loop; enabling them only warns.
        if feeds.get("enable_parallel_deep_research"):
            print(
                "parallel deep research feed is enabled in config but not implemented in this loop. "
                "skipping until adapter is added.",
                file=sys.stderr,
            )
        if feeds.get("enable_web_search"):
            print(
                "web search feed is enabled in config but not implemented in this loop. "
                "skipping until adapter is added.",
                file=sys.stderr,
            )

        # Never grow the inbox beyond the configured budget.
        capacity_remaining = max(0, max_inbox_size - len(inbox))
        if len(new_records) > capacity_remaining:
            new_records = new_records[:capacity_remaining]

        if not dry_run and new_records:
            inbox.extend(new_records)
            write_jsonl(inbox_path, inbox)

        return {
            "ok": True,
            "dry_run": dry_run,
            "new": len(new_records),
            # On a real run ``inbox`` already contains the additions; on a dry
            # run they are added here to report the projected size.
            "inbox_size": len(inbox) if not dry_run else len(inbox) + len(new_records),
            "capacity_remaining": capacity_remaining - len(new_records),
            "first_new": new_records[:3],
        }
140 
141 
def main() -> int:
    """Command-line entry point.

    Parses ``--dry-run``, ``--limit`` and ``--json``, runs ``discover`` with
    the loaded config, and prints the result either as indented JSON or as a
    one-line summary followed by the first new sources. Returns exit code 0.
    """
    cli = argparse.ArgumentParser(description="Discover candidate sources and append to inbox")
    cli.add_argument("--dry-run", action="store_true", help="show what would be added without writing")
    cli.add_argument("--limit", type=int, default=None, help="override discovery_max_new_per_run")
    cli.add_argument("--json", action="store_true")
    options = cli.parse_args()

    summary = discover(load_config(), options.dry_run, options.limit)

    if options.json:
        print(json.dumps(summary, indent=2))
        return 0

    dry_marker = " (dry-run)" if summary["dry_run"] else ""
    headline = (
        f"Discovery added {summary['new']} sources{dry_marker}; "
        f"inbox now {summary['inbox_size']}, capacity remaining {summary['capacity_remaining']}"
    )
    print(headline)
    for entry in summary["first_new"]:
        print(f"- {entry['source_id']} {entry['url']}")
    return 0
162 
163 
# Run the CLI only when executed as a script; main()'s return value is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
166
Agent Skills for Context Engineering

researcher/scripts/loop_discover.py

Preparing the source view

Agent Skills for Context Engineering

researcher/scripts/loop_discover.py