Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/verify_citations.py
#!/usr/bin/env python3
"""
Citation Verification Script

Catches fabricated citations by checking:
1. DOI resolution (via doi.org)
2. Basic metadata matching (title similarity, year match)
3. URL accessibility verification
4. Hallucination pattern detection (generic titles, suspicious patterns)
5. Flags suspicious entries for manual review

Usage:
    python verify_citations.py --report [path]
    python verify_citations.py --report [path] --strict  # Fail on any unverified

Does NOT require API keys - uses free DOI resolver and heuristics.
"""

import sys
import argparse
import re
from pathlib import Path
from typing import List, Dict, Tuple
from urllib import request, error
from urllib.parse import quote
import json
import time
from datetime import datetime


# Characters that end a sentence or a quotation rather than a DOI/URL.
_TRAILING_PUNCTUATION = '.,;:\'"'
# Closing brackets mapped to their openers; a trailing closer is dropped only
# when it has no matching opener inside the token.
_CLOSERS = {')': '(', ']': '[', '}': '{', '>': '<'}

# Whole-word matchers (applied to lowercased titles) so that e.g. "training"
# does not count as "ai" and "mastodon" does not count as "todo".
_PLACEHOLDER_RE = re.compile(r'\b(?:tbd|todo|placeholder|example)\b')
_MODERN_AI_RE = re.compile(r'\b(?:ai|llms?|gpt\w*|transformers?)\b')


def _strip_trailing_punctuation(token: str) -> str:
    """Remove sentence punctuation and unbalanced closing brackets from the end of *token*."""
    while token:
        last = token[-1]
        if last in _TRAILING_PUNCTUATION:
            token = token[:-1]
        elif last in _CLOSERS and token.count(last) > token.count(_CLOSERS[last]):
            token = token[:-1]
        else:
            break
    return token


class CitationVerifier:
    """Verify the citations listed in a research report's Bibliography section.

    Attributes:
        report_path: Path of the report being checked.
        strict_mode: When true, ``verify_all`` fails on any suspicious or
            unverified citation.
        content: Full text of the report.
        suspicious: Results classified as suspicious by the last ``verify_all``.
        verified: Results verified by DOI or URL in the last ``verify_all``.
        errors: Human-readable problems found while parsing the report.
        suspicious_patterns: ``(regex, description)`` pairs matched against titles.
    """

    def __init__(self, report_path: Path, strict_mode: bool = False):
        """Load the report at *report_path*; exits the process if it cannot be read."""
        self.report_path = Path(report_path)
        self.strict_mode = strict_mode
        self.content = self._read_report()
        self.suspicious = []
        self.verified = []
        self.errors = []

        # Hallucination detection patterns (2025 CiteGuard enhancement)
        self.suspicious_patterns = [
            # Generic academic-sounding but fake patterns
            (r'^(A |An |The )?(Study|Analysis|Review|Survey|Investigation) (of|on|into)',
             "Generic academic title pattern"),
            (r'^(Recent|Current|Modern|Contemporary) (Advances|Developments|Trends) in',
             "Generic 'advances' title pattern"),
            # Too perfect, templated titles
            (r'^[A-Z][a-z]+ [A-Z][a-z]+: A (Comprehensive|Complete|Systematic) (Review|Analysis|Guide)$',
             "Too perfect, templated structure"),
        ]

    def _read_report(self) -> str:
        """Return the report text, or print an error and exit with status 1."""
        try:
            with open(self.report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, UnicodeError) as e:
            print(f"ERROR: Cannot read report: {e}")
            sys.exit(1)

    @staticmethod
    def _parse_entry(num: str, raw: str) -> Dict:
        """Build an entry dict from the citation number and its full text.

        Expected shape of *raw*: ``Author (Year). "Title". Venue. URL``.
        Any field that cannot be found is stored as ``None``.
        """
        year_match = re.search(r'\((\d{4})\)', raw)
        title_match = re.search(r'"([^"]+)"', raw)
        doi_match = re.search(r'doi\.org/(10\.\S+)', raw)
        url_match = re.search(r'https?://\S+', raw)

        # Bibliographies usually end an entry with "." or wrap links in
        # brackets; neither belongs to the identifier itself.
        doi = _strip_trailing_punctuation(doi_match.group(1)) if doi_match else None
        url = _strip_trailing_punctuation(url_match.group(0)) if url_match else None

        return {
            'num': num,
            'raw': raw,
            'year': year_match.group(1) if year_match else None,
            'title': title_match.group(1) if title_match else None,
            'doi': doi or None,
            'url': url or None,
        }

    def extract_bibliography(self) -> List[Dict]:
        """Extract bibliography entries from the report.

        Entries start with ``[N]``; following non-empty lines that do not
        start a new entry are treated as continuations of the current one.
        Fields are parsed from the complete (joined) entry text, so a DOI,
        URL or title on a continuation line is still picked up.

        Returns:
            A list of dicts with keys ``num``, ``raw``, ``year``, ``title``,
            ``doi`` and ``url``. Empty when no Bibliography section exists,
            in which case a message is appended to ``self.errors``.
        """
        pattern = r'## Bibliography(.*?)(?=##|\Z)'
        section = re.search(pattern, self.content, re.DOTALL | re.IGNORECASE)

        if not section:
            self.errors.append("No Bibliography section found")
            return []

        numbered = []  # [num, raw_text] pairs, in document order
        for line in section.group(1).strip().split('\n'):
            line = line.strip()
            if not line:
                continue

            start = re.match(r'^\[(\d+)\]\s+(.+)$', line)
            if start:
                numbered.append([start.group(1), start.group(2)])
            elif numbered:
                # Multi-line entry: fold the line into the current entry.
                numbered[-1][1] += ' ' + line

        return [self._parse_entry(num, raw) for num, raw in numbered]

    @staticmethod
    def _metadata_from_csl(data) -> Dict:
        """Reduce a CSL-JSON record to title, year, authors and venue.

        Tolerates missing keys, list-valued titles and empty ``date-parts``
        so that a record with odd metadata still counts as a resolved DOI.
        """
        if not isinstance(data, dict):
            data = {}

        def first_text(value) -> str:
            if isinstance(value, (list, tuple)):
                value = value[0] if value else ''
            return value if isinstance(value, str) else ''

        year = None
        issued = data.get('issued')
        if isinstance(issued, dict):
            parts = issued.get('date-parts') or []
            if parts and isinstance(parts[0], (list, tuple)) and parts[0]:
                try:
                    year = int(parts[0][0])
                except (TypeError, ValueError):
                    year = None

        authors = [
            f"{a.get('family', '')} {a.get('given', '')}"
            for a in (data.get('author') or [])
            if isinstance(a, dict)
        ]

        return {
            'title': first_text(data.get('title')),
            'year': year,
            'authors': authors,
            'venue': first_text(data.get('container-title')),
        }

    def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
        """Resolve *doi* through doi.org and fetch its metadata.

        Returns:
            ``(True, metadata)`` with keys ``title``, ``year``, ``authors``
            and ``venue`` when the DOI resolves; ``(False, {'error': ...})``
            when the request fails; ``(False, {})`` for an empty DOI.
        """
        if not doi:
            return False, {}

        # Use content negotiation to get JSON metadata
        req = request.Request(f"https://doi.org/{quote(doi)}")
        req.add_header('Accept', 'application/vnd.citationstyles.csl+json')

        try:
            with request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode('utf-8'))
        except error.HTTPError as e:
            if e.code == 404:
                return False, {'error': 'DOI not found (404)'}
            return False, {'error': f'HTTP {e.code}'}
        except Exception as e:
            # Best-effort network boundary: report, never crash the run.
            return False, {'error': str(e)}

        return True, self._metadata_from_csl(data)

    @staticmethod
    def _request_status(url: str, method: str) -> int:
        """Send one request with the given HTTP *method* and return the status code."""
        req = request.Request(url, method=method)
        req.add_header('User-Agent', 'Mozilla/5.0 (Research Citation Verifier)')
        with request.urlopen(req, timeout=10) as response:
            return response.status

    def verify_url(self, url: str) -> Tuple[bool, str]:
        """Check that *url* is reachable (2025 CiteGuard enhancement).

        A HEAD request is tried first to avoid downloading the body. If the
        server answers 405 or 501 (HEAD not supported) the check is retried
        with GET. Any 2xx status counts as accessible.

        Returns:
            ``(accessible, status_message)``.
        """
        if not url:
            return False, "No URL"

        try:
            try:
                status = self._request_status(url, 'HEAD')
            except error.HTTPError as e:
                if e.code not in (405, 501):
                    raise
                status = self._request_status(url, 'GET')

            if 200 <= status < 300:
                return True, "URL accessible"
            return False, f"HTTP {status}"
        except error.HTTPError as e:
            return False, f"HTTP {e.code}"
        except error.URLError as e:
            return False, f"URL error: {e.reason}"
        except Exception as e:
            return False, f"Connection error: {str(e)[:50]}"

    def detect_hallucination_patterns(self, entry: Dict) -> List[str]:
        """Detect common LLM hallucination patterns in a citation (2025 CiteGuard).

        Title heuristics run only when a title was parsed; year heuristics
        run whenever a year is present, even if the title is missing.

        Returns:
            A list of human-readable issues (empty when nothing looks wrong).
        """
        issues = []
        title = entry.get('title') or ''
        lowered = title.lower()

        if title:
            # Check against suspicious patterns
            for pattern, description in self.suspicious_patterns:
                if re.match(pattern, title, re.IGNORECASE):
                    issues.append(f"Suspicious title pattern: {description}")

            # Check for overly generic titles
            generic_words = ['overview', 'introduction', 'guide', 'handbook', 'manual']
            if any(word in lowered for word in generic_words) and len(title.split()) < 5:
                issues.append("Very generic short title")

            # Check for placeholder-like titles (whole words only)
            if _PLACEHOLDER_RE.search(lowered):
                issues.append("Placeholder text in title")

        # Check for inconsistent metadata
        year_text = entry.get('year')
        if year_text:
            try:
                year = int(year_text)
            except (TypeError, ValueError):
                return issues

            current_year = datetime.now().year
            # Very recent without DOI or URL is suspicious
            if year >= current_year - 1 and not entry.get('doi') and not entry.get('url'):
                issues.append(f"Recent year ({year}) with no verification method")
            # Future year is definitely wrong
            if year > current_year:
                issues.append(f"Future year: {year} (current: {current_year})")
            # Very old with modern phrasing is suspicious
            if year < 2000 and _MODERN_AI_RE.search(lowered):
                issues.append(f"Anachronistic: pre-2000 ({year}) citation mentioning modern AI terms")

        return issues

    def check_title_similarity(self, title1: str, title2: str) -> float:
        """Return the word-overlap (Jaccard) similarity of two titles.

        Titles are lowercased and punctuation is ignored. The score is in
        the range 0.0-1.0; an empty title on either side gives 0.0.
        """
        if not title1 or not title2:
            return 0.0

        def normalize(s):
            return set(re.sub(r'[^\w\s]', ' ', s.lower()).split())

        words1 = normalize(title1)
        words2 = normalize(title2)

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def verify_entry(self, entry: Dict) -> Dict:
        """Verify a single bibliography entry (Enhanced 2025 with CiteGuard).

        Returns:
            A dict with ``num``, ``status`` (``verified``, ``url_verified``,
            ``suspicious``, ``unverified`` or ``unknown``), ``issues``,
            ``metadata`` (from the DOI record, if any) and
            ``verification_methods`` (``'DOI'`` and/or ``'URL'``).
        """
        result = {
            'num': entry['num'],
            'status': 'unknown',
            'issues': [],
            'metadata': {},
            'verification_methods': []
        }
        doi = entry.get('doi')
        url = entry.get('url')
        title = entry.get('title')
        year = entry.get('year')

        # STEP 1: Run hallucination detection (CiteGuard 2025)
        hallucination_issues = self.detect_hallucination_patterns(entry)
        if hallucination_issues:
            result['issues'].extend(hallucination_issues)
            result['status'] = 'suspicious'

        # STEP 2: Has DOI?
        if doi:
            print(f" [{entry['num']}] Checking DOI {doi}...", end=' ')
            success, metadata = self.verify_doi(doi)

            if success:
                # A resolving DOI outranks the title/year heuristics; the
                # metadata checks below may still mark the entry suspicious.
                result['metadata'] = metadata
                result['status'] = 'verified'
                result['verification_methods'].append('DOI')
                print("✓")

                # Check title similarity if we have both
                if title and metadata.get('title'):
                    similarity = self.check_title_similarity(title, metadata['title'])
                    if similarity < 0.5:
                        result['issues'].append(
                            f"Title mismatch (similarity: {similarity:.1%})"
                        )
                        result['status'] = 'suspicious'

                # Check year match
                if year and metadata.get('year'):
                    try:
                        years_differ = int(year) != int(metadata['year'])
                    except (TypeError, ValueError):
                        years_differ = False
                    if years_differ:
                        result['issues'].append(
                            f"Year mismatch: report says {year}, DOI says {metadata['year']}"
                        )
                        result['status'] = 'suspicious'
            else:
                print(f"✗ {metadata.get('error', 'Failed')}")
                # A failed lookup must not erase a heuristic "suspicious" flag.
                if result['status'] != 'suspicious':
                    result['status'] = 'unverified'
                result['issues'].append(f"DOI resolution failed: {metadata.get('error', 'unknown')}")

        # STEP 3: Check URL accessibility (if no DOI or DOI failed)
        if url and result['status'] != 'verified':
            url_ok, url_status = self.verify_url(url)
            if url_ok:
                result['verification_methods'].append('URL')
                # Upgrade status if URL verifies
                if result['status'] in ['unknown', 'no_doi', 'unverified']:
                    result['status'] = 'url_verified'
                print(f" [{entry['num']}] URL accessible ✓")
            else:
                result['issues'].append(f"URL check failed: {url_status}")

        # STEP 4: Final fallback - no verification method
        if not doi and not url:
            result['issues'].append("No DOI or URL - cannot verify")
            result['status'] = 'suspicious'

        return result

    def verify_all(self):
        """Verify every bibliography entry and print a summary.

        Returns:
            ``True`` when the report passes (possibly with warnings),
            ``False`` when no entries were found or strict mode rejects it.
        """
        print(f"\n{'='*60}")
        print(f"CITATION VERIFICATION: {self.report_path.name}")
        print(f"{'='*60}\n")

        entries = self.extract_bibliography()

        if not entries:
            for message in self.errors:
                print(f"ERROR: {message}")
            print("No bibliography entries found\n")
            return False

        print(f"Found {len(entries)} citations\n")

        results = []
        last_index = len(entries) - 1
        for index, entry in enumerate(entries):
            results.append(self.verify_entry(entry))

            # Rate limiting: only needed between entries that hit the network.
            if index < last_index and (entry.get('doi') or entry.get('url')):
                time.sleep(0.5)

        # Summarize
        print(f"\n{'='*60}")
        print(f"VERIFICATION SUMMARY")
        print(f"{'='*60}\n")

        verified = [r for r in results if r['status'] == 'verified']
        url_verified = [r for r in results if r['status'] == 'url_verified']
        suspicious = [r for r in results if r['status'] == 'suspicious']
        unverified = [r for r in results if r['status'] in ['unverified', 'no_doi', 'unknown']]

        # Keep the outcome available to callers that use the class directly.
        self.verified = verified + url_verified
        self.suspicious = suspicious

        print(f'DOI Verified: {len(verified)}/{len(results)}')
        print(f'URL Verified: {len(url_verified)}/{len(results)}')
        print(f'Suspicious: {len(suspicious)}/{len(results)}')
        print(f'Unverified: {len(unverified)}/{len(results)}')
        print()

        if suspicious:
            print('SUSPICIOUS CITATIONS (Manual Review Needed):')
            for r in suspicious:
                print(f"\n [{r['num']}]")
                for issue in r['issues']:
                    print(f" - {issue}")
            print()

        if unverified:
            print('UNVERIFIED CITATIONS (Could not check):')
            for r in unverified:
                print(f" [{r['num']}] {r['issues'][0] if r['issues'] else 'Unknown'}")
            print()

        # Decision (Enhanced 2025 - includes URL-verified as acceptable)
        total_verified = len(verified) + len(url_verified)

        if suspicious:
            print('WARNING: Suspicious citations detected')
            if self.strict_mode:
                print(' STRICT MODE: Failing due to suspicious citations')
                return False
            print(' (Continuing in non-strict mode)')

        if self.strict_mode and unverified:
            print('STRICT MODE: Unverified citations found')
            return False

        if total_verified / len(results) < 0.5:
            print('WARNING: Less than 50% citations verified')
            return True  # Pass with warning

        print('CITATION VERIFICATION PASSED')
        return True


def main():
    """Command-line entry point: parse arguments, verify, exit 0 on pass and 1 on failure."""
    parser = argparse.ArgumentParser(
        description="Verify citations in research report",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python verify_citations.py --report report.md

Note: Requires internet connection to check DOIs.
Uses free DOI resolver - no API key needed.
"""
    )

    parser.add_argument(
        '--report', '-r',
        type=str,
        required=True,
        help='Path to research report markdown file'
    )

    parser.add_argument(
        '--strict',
        action='store_true',
        help='Strict mode: fail on any unverified or suspicious citations'
    )

    args = parser.parse_args()
    report_path = Path(args.report)

    if not report_path.exists():
        print(f"ERROR: Report file not found: {report_path}")
        sys.exit(1)

    verifier = CitationVerifier(report_path, strict_mode=args.strict)
    passed = verifier.verify_all()

    sys.exit(0 if passed else 1)


if __name__ == '__main__':
    main()