Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/citation_manager.py
#!/usr/bin/env python3
"""
Citation Manager — stable source identity and run manifest management.

CLI subcommands:
    init-run                Create run_manifest.json + empty artifact JSONL files
    register-source         Append a source to sources.jsonl, return source_id
    assign-display-numbers  Generate stable_id -> display_number mapping
    export-bibliography     Render bibliography from sources.jsonl

Source identity:
    source_id = sha256(canonical_locator)[:16]
    canonical_locator = doi:..., arxiv:..., or normalized URL

All state is append-only JSONL. No mutable citation numbers in state files.
"""

import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urlparse, urlunparse


# ---------------------------------------------------------------------------
# Canonical locator normalization
# ---------------------------------------------------------------------------

DOI_RE = re.compile(r'(?:https?://(?:dx\.)?doi\.org/|doi:)(10\.\d{4,}/\S+)', re.IGNORECASE)
ARXIV_RE = re.compile(r'(?:https?://arxiv\.org/abs/|arxiv:)(\d{4}\.\d{4,}(?:v\d+)?)', re.IGNORECASE)

# URL query params that are tracking noise, not content identifiers
TRACKING_PARAMS = frozenset([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'ref', 'source', 'fbclid', 'gclid', 'mc_cid', 'mc_eid',
])

# Scheme-default ports carry no identity information and are dropped during
# normalization; any other explicit port is part of the source's identity.
_DEFAULT_PORTS = {'http': 80, 'https': 443}


def canonicalize_locator(raw_url: str) -> str:
    """Derive a canonical locator from a raw URL or identifier string.

    Priority: DOI > arXiv > normalized URL.

    URL normalization: lowercase scheme and host, keep non-default ports,
    strip the fragment, trailing path slashes, and tracking query params,
    and sort surviving query params so equivalent URLs map to the same
    locator (and therefore the same source_id).
    """
    # DOI — trailing dots are sentence punctuation picked up by \S+, not
    # part of the DOI itself, so strip them.
    m = DOI_RE.search(raw_url)
    if m:
        return f'doi:{m.group(1).rstrip(".")}'

    # arXiv
    m = ARXIV_RE.search(raw_url)
    if m:
        return f'arxiv:{m.group(1)}'

    # Normalized URL
    parsed = urlparse(raw_url)
    scheme = (parsed.scheme or 'https').lower()
    host = (parsed.hostname or '').lower()
    # Bug fix: parsed.hostname drops the port, which made example.com:8080
    # and example.com collide into a single source_id. Preserve any port
    # that is not the scheme default. (parsed.port raises ValueError on a
    # malformed port; treat that the same as no port.)
    try:
        port = parsed.port
    except ValueError:
        port = None
    if port is not None and port != _DEFAULT_PORTS.get(scheme):
        host = f'{host}:{port}'
    path = parsed.path.rstrip('/')
    # Drop tracking params (keys matched case-insensitively), then sort the
    # survivors so parameter order does not affect identity.
    if parsed.query:
        kept = [
            pair for pair in parsed.query.split('&')
            if pair.split('=', 1)[0].lower() not in TRACKING_PARAMS
        ]
        query = '&'.join(sorted(kept))
    else:
        query = ''
    return urlunparse((scheme, host, path, '', query, ''))


def compute_source_id(canonical_locator: str) -> str:
    """Return the stable source ID: sha256(canonical_locator)[:16] hex."""
    return hashlib.sha256(canonical_locator.encode('utf-8')).hexdigest()[:16]


# ---------------------------------------------------------------------------
# JSONL helpers
# ---------------------------------------------------------------------------

def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* as one JSON line.

    Always UTF-8: the rows are written with ensure_ascii=False, so relying
    on the locale default encoding would corrupt non-ASCII titles/authors
    on platforms where that default is not UTF-8.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')


def read_jsonl(path: str) -> list[dict]:
    """Read all rows from a JSONL file; a missing file yields an empty list.

    Blank lines are skipped so a trailing newline never produces a row.
    """
    rows: list[dict] = []
    if not os.path.exists(path):
        return rows
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows


# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------

def cmd_init_run(args: argparse.Namespace) -> None:
    """Create run_manifest.json and empty JSONL artifact files.

    Prints a JSON status object with the manifest path and run directory.
    """
    out_dir = os.path.abspath(args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    artifact_paths = {
        'sources': 'sources.jsonl',
        'evidence': 'evidence.jsonl',
        'claims': 'claims.jsonl',
        'report': 'report.md',
    }

    manifest = {
        'version': '3.0.0',
        'query': args.query or '',
        'mode': args.mode,
        'started_at': datetime.now(timezone.utc).isoformat(),
        'finished_at': None,
        'assumptions': [],
        'provider_config': {
            'primary': 'search-cli',
            'scholarly': None,
        },
        'report_dir': out_dir,
        'artifact_paths': artifact_paths,
        'continuation': None,
    }

    manifest_path = os.path.join(out_dir, 'run_manifest.json')
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
        f.write('\n')

    # Create empty artifact files (report.md is created later by the
    # pipeline, so only the JSONL artifacts are touched here).
    for name in ('sources', 'evidence', 'claims'):
        p = os.path.join(out_dir, artifact_paths[name])
        if not os.path.exists(p):
            with open(p, 'w', encoding='utf-8'):
                pass

    print(json.dumps({'status': 'ok', 'manifest': manifest_path, 'dir': out_dir}))


def cmd_register_source(args: argparse.Namespace) -> None:
    """Register a source, append to sources.jsonl, print source_id.

    Idempotent: a source whose canonical locator is already registered is
    reported as status=duplicate and not appended again.
    """
    data = json.loads(args.json)
    raw_url = data.get('raw_url', data.get('url', ''))
    if not raw_url:
        print(json.dumps({'error': 'raw_url is required'}), file=sys.stderr)
        sys.exit(1)

    # Caller may pre-supply a canonical locator (e.g. a verified DOI);
    # otherwise derive one from the raw URL.
    canonical = data.get('canonical_locator') or canonicalize_locator(raw_url)
    source_id = compute_source_id(canonical)

    sources_path = os.path.join(args.dir, 'sources.jsonl')

    # Duplicate check keeps sources.jsonl append-only yet unique by ID.
    existing = read_jsonl(sources_path)
    for row in existing:
        if row.get('source_id') == source_id:
            print(json.dumps({
                'status': 'duplicate',
                'source_id': source_id,
                'canonical_locator': canonical,
            }))
            return

    source = {
        'source_id': source_id,
        'canonical_locator': canonical,
        'raw_url': raw_url,
        'title': data.get('title', ''),
        'authors': data.get('authors'),
        'year': data.get('year'),
        'source_type': data.get('source_type', 'web'),
        'metadata_status': data.get('metadata_status', 'unverified'),
        'registered_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(sources_path, source)
    print(json.dumps({
        'status': 'registered',
        'source_id': source_id,
        'canonical_locator': canonical,
    }))


def cmd_assign_display_numbers(args: argparse.Namespace) -> None:
    """Read sources.jsonl, assign stable display numbers in registration order.

    Prints a JSON object mapping source_id -> display number (1-based).
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    mapping: dict[str, int] = {}
    for src in sources:
        sid = src['source_id']
        if sid not in mapping:
            # Number by count of unique IDs, not file row index: if a
            # duplicate row ever slips in, numbering stays gapless and
            # consistent with export-bibliography's dense numbering.
            mapping[sid] = len(mapping) + 1

    print(json.dumps(mapping, indent=2))


def cmd_export_bibliography(args: argparse.Namespace) -> None:
    """Generate a bibliography from sources.jsonl in markdown or JSON style."""
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    # Deduplicate by source_id, preserving first-registration order so
    # display numbers match assign-display-numbers.
    seen: set[str] = set()
    unique: list[dict] = []
    for src in sources:
        if src['source_id'] not in seen:
            seen.add(src['source_id'])
            unique.append(src)

    style = args.style

    if style == 'markdown':
        lines = ['## Bibliography', '']
        for i, src in enumerate(unique, 1):
            # Compact author string: "A. ", "A & B. ", or "A et al. "
            author_str = ''
            if src.get('authors'):
                authors = src['authors']
                if len(authors) == 1:
                    author_str = f'{authors[0]}. '
                elif len(authors) == 2:
                    author_str = f'{authors[0]} & {authors[1]}. '
                else:
                    author_str = f'{authors[0]} et al. '

            year_str = f'({src["year"]})' if src.get('year') else '(n.d.)'
            title = src.get('title', 'Untitled')
            url = src.get('raw_url', '')
            lines.append(f'[{i}] {author_str}{year_str}. [{title}]({url})')
        print('\n'.join(lines))

    elif style == 'json':
        out = []
        for i, src in enumerate(unique, 1):
            out.append({
                'display_number': i,
                'source_id': src['source_id'],
                'canonical_locator': src['canonical_locator'],
                'title': src.get('title', ''),
                'authors': src.get('authors'),
                'year': src.get('year'),
                'raw_url': src.get('raw_url', ''),
            })
        print(json.dumps(out, indent=2, ensure_ascii=False))

    else:
        print(f'Unknown style: {style}', file=sys.stderr)
        sys.exit(1)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse CLI arguments and dispatch to the selected subcommand."""
    parser = argparse.ArgumentParser(
        prog='citation_manager',
        description='Stable source identity and run manifest management for deep-research v3.0',
    )
    sub = parser.add_subparsers(dest='command', required=True)

    # init-run
    p_init = sub.add_parser('init-run', help='Create run manifest and empty artifact files')
    p_init.add_argument('--out-dir', required=True, help='Output directory for the research run')
    p_init.add_argument('--query', default='', help='Original research question')
    p_init.add_argument('--mode', default='standard', choices=['quick', 'standard', 'deep', 'ultradeep'])

    # register-source
    p_reg = sub.add_parser('register-source', help='Register a source and return its stable ID')
    p_reg.add_argument('--json', required=True, help='JSON object with at least raw_url and title')
    p_reg.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # assign-display-numbers
    p_num = sub.add_parser('assign-display-numbers', help='Map stable source IDs to display numbers')
    p_num.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # export-bibliography
    p_bib = sub.add_parser('export-bibliography', help='Generate bibliography from sources')
    p_bib.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')
    p_bib.add_argument('--style', default='markdown', choices=['markdown', 'json'])

    args = parser.parse_args()

    dispatch = {
        'init-run': cmd_init_run,
        'register-source': cmd_register_source,
        'assign-display-numbers': cmd_assign_display_numbers,
        'export-bibliography': cmd_export_bibliography,
    }
    dispatch[args.command](args)


if __name__ == '__main__':
    main()