Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/extract_claims.py
#!/usr/bin/env python3
"""
Atomic Claim Extractor — decomposes report sections into typed claims.

CLI subcommands:
    extract   Parse a markdown report into atomic claims (claims.jsonl)
    add       Manually add a single claim
    list      List claims, optionally filtered by section or type
    stats     Show claim statistics (counts by type/status)

Claim identity:
    claim_id = sha256(section_id + normalized_text)[:16]

Claim types (per GPT Pro's refinement of Codex's proposal):
- factual: hard-fails on lack of support
- synthesis: needs traceability, softer threshold
- recommendation: needs traceability, softer threshold
- speculation: labeled, no support gate
"""

import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone


# ---------------------------------------------------------------------------
# Claim ID computation
# ---------------------------------------------------------------------------

# Collapses any run of whitespace (spaces, tabs, newlines) to a single space.
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text: str) -> str:
    """Normalize *text* for stable hashing: trim, collapse whitespace, lowercase."""
    return _WHITESPACE_RE.sub(' ', text.strip()).lower()


def compute_claim_id(section_id: str, text: str) -> str:
    """Return sha256(section_id + normalized_text)[:16] as a hex string.

    The id is stable under whitespace/case variations of *text*, so
    re-extracting the same report does not create duplicate claims.
    """
    payload = section_id + normalize_text(text)
    return hashlib.sha256(payload.encode('utf-8')).hexdigest()[:16]


# ---------------------------------------------------------------------------
# JSONL helpers
# ---------------------------------------------------------------------------

def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* as a single JSON line to *path* (created if missing)."""
    # Encoding pinned to UTF-8: ensure_ascii=False may emit non-ASCII text,
    # which would crash on platforms whose default encoding is not UTF-8.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')


def read_jsonl(path: str) -> list[dict]:
    """Read all rows from a JSONL file; a missing file yields an empty list."""
    rows: list[dict] = []
    if not os.path.exists(path):
        return rows
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                rows.append(json.loads(line))
    return rows
# ---------------------------------------------------------------------------
# Report parsing helpers
# ---------------------------------------------------------------------------

# Section header patterns: each entry maps a '## ...' heading to a section id,
# either a fixed string or a callable deriving the id from the regex match.
# Order matters — the generic catch-all pattern must remain last.
SECTION_PATTERNS = [
    (re.compile(r'^##\s+Executive\s+Summary', re.I), 'executive_summary'),
    (re.compile(r'^##\s+Introduction', re.I), 'introduction'),
    (re.compile(r'^##\s+Finding\s+(\d+)', re.I), lambda m: f'finding_{m.group(1)}'),
    (re.compile(r'^##\s+Synthesis', re.I), 'synthesis'),
    (re.compile(r'^##\s+Limitations', re.I), 'limitations'),
    (re.compile(r'^##\s+Recommendations', re.I), 'recommendations'),
    (re.compile(r'^##\s+Conclusion', re.I), 'conclusion'),
    # Catch-all: slugify any other '## Heading', capped at 30 chars.
    (re.compile(r'^##\s+(.+)', re.I),
     lambda m: re.sub(r'\W+', '_', m.group(1).strip().lower())[:30]),
]

# Citation pattern [N] or [N, M]
CITATION_RE = re.compile(r'\[(\d+(?:,\s*\d+)*)\]')

# Sentence splitting: break after .!? only when the next char is a capital,
# which keeps most abbreviations ("e.g. foo") intact.
SENTENCE_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')


def classify_claim(text: str, section_id: str) -> str:
    """Heuristic claim-type classification.

    Precedence: recommendation > speculation > synthesis > factual.
    NOTE: keyword checks are plain substring tests ('shoulder' contains
    'should') — accepted noise for a heuristic first pass.
    """
    lower = text.lower()

    # Recommendation indicators — apply regardless of section.
    # (The original had a dead `if section_id == 'recommendations'` branch
    # here whose both paths returned 'recommendation'; removed.)
    if any(w in lower for w in ('should', 'recommend', 'suggest', 'advise', 'consider')):
        return 'recommendation'

    # Speculation indicators
    if any(w in lower for w in ('might', 'could potentially', 'it is possible',
                                'may eventually', 'hypothetically', 'speculatively')):
        return 'speculation'

    # Synthesis indicators, only inside synthesis-flavored sections.
    if section_id in ('synthesis', 'conclusion', 'limitations'):
        if any(w in lower for w in ('overall', 'taken together', 'collectively',
                                    'the evidence suggests', 'this implies')):
            return 'synthesis'

    # Default: factual (strictest support gate downstream).
    return 'factual'


def parse_sections(markdown: str) -> list[tuple[str, str]]:
    """Parse markdown into (section_id, content) pairs.

    Content before the first recognized heading is grouped under 'preamble'.
    """
    sections: list[tuple[str, str]] = []
    current_id = 'preamble'
    current_lines: list[str] = []

    for line in markdown.split('\n'):
        matched = False
        for pattern, id_or_fn in SECTION_PATTERNS:
            m = pattern.match(line)
            if m:
                # Flush the accumulated section before starting the new one.
                if current_lines:
                    sections.append((current_id, '\n'.join(current_lines)))
                current_id = id_or_fn(m) if callable(id_or_fn) else id_or_fn
                current_lines = []
                matched = True
                break
        if not matched:
            current_lines.append(line)

    if current_lines:
        sections.append((current_id, '\n'.join(current_lines)))

    return sections


def extract_sentences(text: str) -> list[str]:
    """Split text into sentences, filtering markdown noise and short fragments."""
    # Strip markdown formatting noise before splitting.
    text = re.sub(r'^[-*]\s+', '', text, flags=re.M)  # bullet points
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)    # bold
    text = re.sub(r'\*([^*]+)\*', r'\1', text)        # italic

    result = []
    for s in SENTENCE_RE.split(text):
        s = s.strip()
        # Drop very short fragments, headings, and table rows.
        if len(s) > 30 and not s.startswith('#') and not s.startswith('|'):
            result.append(s)
    return result


# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------

def _dedupe_claims(rows: list[dict]) -> list[dict]:
    """Drop rows with a duplicate claim_id, keeping the first occurrence."""
    seen: set = set()
    unique: list[dict] = []
    for r in rows:
        cid = r.get('claim_id')
        if cid not in seen:
            seen.add(cid)
            unique.append(r)
    return unique


def cmd_extract(args: argparse.Namespace) -> None:
    """Extract atomic claims from a markdown report into claims.jsonl."""
    report_path = args.report
    if not os.path.exists(report_path):
        print(json.dumps({'error': f'Report not found: {report_path}'}), file=sys.stderr)
        sys.exit(1)

    with open(report_path, encoding='utf-8') as f:
        markdown = f.read()

    os.makedirs(args.dir, exist_ok=True)  # tolerate a fresh run directory
    claims_path = os.path.join(args.dir, 'claims.jsonl')
    existing_ids = {r['claim_id'] for r in read_jsonl(claims_path)}

    added = 0
    skipped = 0
    for section_id, content in parse_sections(markdown):
        if section_id == 'preamble':
            continue  # text before the first heading is not claim-worthy
        for sentence in extract_sentences(content):
            claim_id = compute_claim_id(section_id, sentence)
            if claim_id in existing_ids:
                skipped += 1
                continue

            # Collect citation numbers like [3] or [2, 5] for the linking step.
            citation_nums: list[int] = []
            for m in CITATION_RE.finditer(sentence):
                citation_nums.extend(int(n.strip()) for n in m.group(1).split(','))

            claim = {
                'claim_id': claim_id,
                'section_id': section_id,
                'text': sentence,
                'claim_type': classify_claim(sentence, section_id),
                'cited_source_ids': [],  # Populated by linking step
                'evidence_ids': [],      # Populated by verify_claim_support
                'support_status': 'unverified',
                'extracted_at': datetime.now(timezone.utc).isoformat(),
                '_citation_numbers': citation_nums,  # Temporary, for linking
            }
            append_jsonl(claims_path, claim)
            existing_ids.add(claim_id)
            added += 1

    print(json.dumps({
        'status': 'ok',
        'claims_added': added,
        'claims_skipped': skipped,
        'total_claims': len(existing_ids),
    }))


def cmd_add(args: argparse.Namespace) -> None:
    """Manually add a single claim from a JSON payload."""
    data = json.loads(args.json)
    section_id = data.get('section_id', 'unknown')
    text = data.get('text', '')
    if not text:
        print(json.dumps({'error': 'text is required'}), file=sys.stderr)
        sys.exit(1)

    claim_id = compute_claim_id(section_id, text)
    os.makedirs(args.dir, exist_ok=True)  # tolerate a fresh run directory
    claims_path = os.path.join(args.dir, 'claims.jsonl')

    for row in read_jsonl(claims_path):
        if row.get('claim_id') == claim_id:
            print(json.dumps({'status': 'duplicate', 'claim_id': claim_id}))
            return

    # Unknown claim types fall back to 'factual' (the strictest gate).
    valid_types = {'factual', 'synthesis', 'recommendation', 'speculation'}
    claim_type = data.get('claim_type', 'factual')
    if claim_type not in valid_types:
        claim_type = 'factual'

    claim = {
        'claim_id': claim_id,
        'section_id': section_id,
        'text': text,
        'claim_type': claim_type,
        'cited_source_ids': data.get('cited_source_ids', []),
        'evidence_ids': data.get('evidence_ids', []),
        'support_status': 'unverified',
        'extracted_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(claims_path, claim)
    print(json.dumps({'status': 'added', 'claim_id': claim_id}))


def cmd_list(args: argparse.Namespace) -> None:
    """List claims with optional section/type/status filters."""
    claims_path = os.path.join(args.dir, 'claims.jsonl')
    rows = read_jsonl(claims_path)

    if args.section:
        rows = [r for r in rows if r.get('section_id') == args.section]
    if args.type:
        rows = [r for r in rows if r.get('claim_type') == args.type]
    if args.status:
        rows = [r for r in rows if r.get('support_status') == args.status]

    unique = _dedupe_claims(rows)
    print(json.dumps({'count': len(unique), 'claims': unique}, indent=2, ensure_ascii=False))


def cmd_stats(args: argparse.Namespace) -> None:
    """Show claim statistics grouped by type, status, and section."""
    claims_path = os.path.join(args.dir, 'claims.jsonl')
    unique = _dedupe_claims(read_jsonl(claims_path))

    by_type: dict[str, int] = {}
    by_status: dict[str, int] = {}
    by_section: dict[str, int] = {}
    for r in unique:
        t = r.get('claim_type', 'unknown')
        s = r.get('support_status', 'unknown')
        sec = r.get('section_id', 'unknown')
        by_type[t] = by_type.get(t, 0) + 1
        by_status[s] = by_status.get(s, 0) + 1
        by_section[sec] = by_section.get(sec, 0) + 1

    print(json.dumps({
        'total': len(unique),
        'by_type': by_type,
        'by_status': by_status,
        'by_section': by_section,
    }, indent=2))


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse CLI arguments and dispatch to the selected subcommand."""
    parser = argparse.ArgumentParser(
        prog='extract_claims',
        description='Atomic claim extraction and ledger for deep-research v3.0',
    )
    sub = parser.add_subparsers(dest='command', required=True)

    # extract
    p_ext = sub.add_parser('extract', help='Extract claims from markdown report')
    p_ext.add_argument('--report', required=True, help='Path to report.md')
    p_ext.add_argument('--dir', required=True, help='Run directory containing claims.jsonl')

    # add
    p_add = sub.add_parser('add', help='Manually add a single claim')
    p_add.add_argument('--json', required=True, help='JSON with section_id, text, claim_type')
    p_add.add_argument('--dir', required=True, help='Run directory')

    # list
    p_list = sub.add_parser('list', help='List claims')
    p_list.add_argument('--dir', required=True, help='Run directory')
    p_list.add_argument('--section', default=None, help='Filter by section_id')
    p_list.add_argument('--type', default=None, help='Filter by claim_type')
    p_list.add_argument('--status', default=None, help='Filter by support_status')

    # stats
    p_stats = sub.add_parser('stats', help='Claim statistics')
    p_stats.add_argument('--dir', required=True, help='Run directory')

    args = parser.parse_args()
    dispatch = {
        'extract': cmd_extract,
        'add': cmd_add,
        'list': cmd_list,
        'stats': cmd_stats,
    }
    dispatch[args.command](args)


if __name__ == '__main__':
    main()