Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/citation_manager.py
#!/usr/bin/env python3
"""
Citation Manager — stable source identity and run manifest management.

CLI subcommands:
    init-run                Create run_manifest.json + empty artifact JSONL files
    register-source         Append a source to sources.jsonl, return source_id
    assign-display-numbers  Generate stable_id -> display_number mapping
    export-bibliography     Render bibliography from sources.jsonl

Source identity:
    source_id = sha256(canonical_locator)[:16]
    canonical_locator = doi:..., arxiv:..., or normalized URL

All state is append-only JSONL. No mutable citation numbers in state files.
"""

import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from urllib.parse import urlparse, urlunparse


# ---------------------------------------------------------------------------
# Canonical locator normalization
# ---------------------------------------------------------------------------

DOI_RE = re.compile(r'(?:https?://(?:dx\.)?doi\.org/|doi:)(10\.\d{4,}/\S+)', re.IGNORECASE)
ARXIV_RE = re.compile(r'(?:https?://arxiv\.org/abs/|arxiv:)(\d{4}\.\d{4,}(?:v\d+)?)', re.IGNORECASE)

# URL query params that are tracking noise, not content identifiers
TRACKING_PARAMS = frozenset([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'ref', 'source', 'fbclid', 'gclid', 'mc_cid', 'mc_eid',
])

# Scheme-default ports carry no identity information and are dropped during
# normalization; any other explicit port is part of the source's identity.
_DEFAULT_PORTS = {'http': 80, 'https': 443}


def canonicalize_locator(raw_url: str) -> str:
    """Derive a canonical locator from a raw URL or identifier string.

    Priority: DOI > arXiv > normalized URL.

    URL normalization: lowercase scheme and host, keep non-default ports,
    strip the fragment, trailing path slashes, and tracking query params,
    and sort surviving query params so equivalent URLs map to the same
    locator (and therefore the same source_id).
    """
    # DOI — trailing dots are sentence punctuation picked up by \S+, not
    # part of the DOI itself, so strip them.
    m = DOI_RE.search(raw_url)
    if m:
        return f'doi:{m.group(1).rstrip(".")}'

    # arXiv
    m = ARXIV_RE.search(raw_url)
    if m:
        return f'arxiv:{m.group(1)}'

    # Normalized URL
    parsed = urlparse(raw_url)
    scheme = (parsed.scheme or 'https').lower()
    host = (parsed.hostname or '').lower()
    # Bug fix: parsed.hostname drops the port, which made example.com:8080
    # and example.com collide into a single source_id. Preserve any port
    # that is not the scheme default. (parsed.port raises ValueError on a
    # malformed port; treat that the same as no port.)
    try:
        port = parsed.port
    except ValueError:
        port = None
    if port is not None and port != _DEFAULT_PORTS.get(scheme):
        host = f'{host}:{port}'
    path = parsed.path.rstrip('/')
    # Drop tracking params (keys matched case-insensitively), then sort the
    # survivors so parameter order does not affect identity.
    if parsed.query:
        kept = [
            pair for pair in parsed.query.split('&')
            if pair.split('=', 1)[0].lower() not in TRACKING_PARAMS
        ]
        query = '&'.join(sorted(kept))
    else:
        query = ''
    return urlunparse((scheme, host, path, '', query, ''))


def compute_source_id(canonical_locator: str) -> str:
    """Return the stable source ID: sha256(canonical_locator)[:16] hex."""
    return hashlib.sha256(canonical_locator.encode('utf-8')).hexdigest()[:16]


# ---------------------------------------------------------------------------
# JSONL helpers
# ---------------------------------------------------------------------------

def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* as one JSON line.

    Always UTF-8: the rows are written with ensure_ascii=False, so relying
    on the locale default encoding would corrupt non-ASCII titles/authors
    on platforms where that default is not UTF-8.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')


def read_jsonl(path: str) -> list[dict]:
    """Read all rows from a JSONL file; a missing file yields an empty list.

    Blank lines are skipped so a trailing newline never produces a row.
    """
    rows: list[dict] = []
    if not os.path.exists(path):
        return rows
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows


# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------

def cmd_init_run(args: argparse.Namespace) -> None:
    """Create run_manifest.json and empty JSONL artifact files.

    Prints a JSON status object with the manifest path and run directory.
    """
    out_dir = os.path.abspath(args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    artifact_paths = {
        'sources': 'sources.jsonl',
        'evidence': 'evidence.jsonl',
        'claims': 'claims.jsonl',
        'report': 'report.md',
    }

    manifest = {
        'version': '3.0.0',
        'query': args.query or '',
        'mode': args.mode,
        'started_at': datetime.now(timezone.utc).isoformat(),
        'finished_at': None,
        'assumptions': [],
        'provider_config': {
            'primary': 'search-cli',
            'scholarly': None,
        },
        'report_dir': out_dir,
        'artifact_paths': artifact_paths,
        'continuation': None,
    }

    manifest_path = os.path.join(out_dir, 'run_manifest.json')
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
        f.write('\n')

    # Create empty artifact files (report.md is created later by the
    # pipeline, so only the JSONL artifacts are touched here).
    for name in ('sources', 'evidence', 'claims'):
        p = os.path.join(out_dir, artifact_paths[name])
        if not os.path.exists(p):
            with open(p, 'w', encoding='utf-8'):
                pass

    print(json.dumps({'status': 'ok', 'manifest': manifest_path, 'dir': out_dir}))


def cmd_register_source(args: argparse.Namespace) -> None:
    """Register a source, append to sources.jsonl, print source_id.

    Idempotent: a source whose canonical locator is already registered is
    reported as status=duplicate and not appended again.
    """
    data = json.loads(args.json)
    raw_url = data.get('raw_url', data.get('url', ''))
    if not raw_url:
        print(json.dumps({'error': 'raw_url is required'}), file=sys.stderr)
        sys.exit(1)

    # Caller may pre-supply a canonical locator (e.g. a verified DOI);
    # otherwise derive one from the raw URL.
    canonical = data.get('canonical_locator') or canonicalize_locator(raw_url)
    source_id = compute_source_id(canonical)

    sources_path = os.path.join(args.dir, 'sources.jsonl')

    # Duplicate check keeps sources.jsonl append-only yet unique by ID.
    existing = read_jsonl(sources_path)
    for row in existing:
        if row.get('source_id') == source_id:
            print(json.dumps({
                'status': 'duplicate',
                'source_id': source_id,
                'canonical_locator': canonical,
            }))
            return

    source = {
        'source_id': source_id,
        'canonical_locator': canonical,
        'raw_url': raw_url,
        'title': data.get('title', ''),
        'authors': data.get('authors'),
        'year': data.get('year'),
        'source_type': data.get('source_type', 'web'),
        'metadata_status': data.get('metadata_status', 'unverified'),
        'registered_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(sources_path, source)
    print(json.dumps({
        'status': 'registered',
        'source_id': source_id,
        'canonical_locator': canonical,
    }))


def cmd_assign_display_numbers(args: argparse.Namespace) -> None:
    """Read sources.jsonl, assign stable display numbers in registration order.

    Prints a JSON object mapping source_id -> display number (1-based).
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    mapping: dict[str, int] = {}
    for src in sources:
        sid = src['source_id']
        if sid not in mapping:
            # Number by count of unique IDs, not file row index: if a
            # duplicate row ever slips in, numbering stays gapless and
            # consistent with export-bibliography's dense numbering.
            mapping[sid] = len(mapping) + 1

    print(json.dumps(mapping, indent=2))


def cmd_export_bibliography(args: argparse.Namespace) -> None:
    """Generate a bibliography from sources.jsonl in markdown or JSON style."""
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    # Deduplicate by source_id, preserving first-registration order so
    # display numbers match assign-display-numbers.
    seen: set[str] = set()
    unique: list[dict] = []
    for src in sources:
        if src['source_id'] not in seen:
            seen.add(src['source_id'])
            unique.append(src)

    style = args.style

    if style == 'markdown':
        lines = ['## Bibliography', '']
        for i, src in enumerate(unique, 1):
            # Compact author string: "A. ", "A & B. ", or "A et al. "
            author_str = ''
            if src.get('authors'):
                authors = src['authors']
                if len(authors) == 1:
                    author_str = f'{authors[0]}. '
                elif len(authors) == 2:
                    author_str = f'{authors[0]} & {authors[1]}. '
                else:
                    author_str = f'{authors[0]} et al. '

            year_str = f'({src["year"]})' if src.get('year') else '(n.d.)'
            title = src.get('title', 'Untitled')
            url = src.get('raw_url', '')
            lines.append(f'[{i}] {author_str}{year_str}. [{title}]({url})')
        print('\n'.join(lines))

    elif style == 'json':
        out = []
        for i, src in enumerate(unique, 1):
            out.append({
                'display_number': i,
                'source_id': src['source_id'],
                'canonical_locator': src['canonical_locator'],
                'title': src.get('title', ''),
                'authors': src.get('authors'),
                'year': src.get('year'),
                'raw_url': src.get('raw_url', ''),
            })
        print(json.dumps(out, indent=2, ensure_ascii=False))

    else:
        print(f'Unknown style: {style}', file=sys.stderr)
        sys.exit(1)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse CLI arguments and dispatch to the selected subcommand."""
    parser = argparse.ArgumentParser(
        prog='citation_manager',
        description='Stable source identity and run manifest management for deep-research v3.0',
    )
    sub = parser.add_subparsers(dest='command', required=True)

    # init-run
    p_init = sub.add_parser('init-run', help='Create run manifest and empty artifact files')
    p_init.add_argument('--out-dir', required=True, help='Output directory for the research run')
    p_init.add_argument('--query', default='', help='Original research question')
    p_init.add_argument('--mode', default='standard', choices=['quick', 'standard', 'deep', 'ultradeep'])

    # register-source
    p_reg = sub.add_parser('register-source', help='Register a source and return its stable ID')
    p_reg.add_argument('--json', required=True, help='JSON object with at least raw_url and title')
    p_reg.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # assign-display-numbers
    p_num = sub.add_parser('assign-display-numbers', help='Map stable source IDs to display numbers')
    p_num.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # export-bibliography
    p_bib = sub.add_parser('export-bibliography', help='Generate bibliography from sources')
    p_bib.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')
    p_bib.add_argument('--style', default='markdown', choices=['markdown', 'json'])

    args = parser.parse_args()

    dispatch = {
        'init-run': cmd_init_run,
        'register-source': cmd_register_source,
        'assign-display-numbers': cmd_assign_display_numbers,
        'export-bibliography': cmd_export_bibliography,
    }
    dispatch[args.command](args)


if __name__ == '__main__':
    main()