Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/verify_html.py
#!/usr/bin/env python3
"""
HTML Report Verification Script
Validates that HTML reports are properly generated with all sections from MD
"""

import argparse
import re
from pathlib import Path
from typing import List, Optional, Sequence, Tuple

# Code points treated as emoji. Each range is listed explicitly: a single
# wide span such as U+24C2..U+1F251 would also swallow CJK ideographs,
# Hangul and full-width punctuation, flagging ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"  # circled latin capital letter M
    "]+"
)


class HTMLVerifier:
    """Verify HTML research reports

    Compares a generated HTML report with the markdown file it was built
    from. Problems are collected in ``errors`` (which fail verification)
    and ``warnings`` (which do not).
    """

    def __init__(self, html_path: Path, md_path: Path):
        self.html_path = html_path
        self.md_path = md_path
        self.errors: List[str] = []
        self.warnings: List[str] = []

    def verify(self) -> bool:
        """
        Run all verification checks

        Returns:
            True if all checks pass, False otherwise
        """
        print(f"\n{'='*60}")
        print("HTML REPORT VERIFICATION")
        print(f"{'='*60}\n")

        print(f"HTML File: {self.html_path}")
        print(f"MD File: {self.md_path}\n")

        # Read files. The encoding is pinned to UTF-8 so the result does not
        # depend on the platform's locale default.
        try:
            html_content = self.html_path.read_text(encoding="utf-8")
            md_content = self.md_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            self.errors.append(f"Failed to read files: {e}")
            # Report before bailing out so the failure reason is visible.
            self._print_results()
            return False

        # Run checks
        self._check_sections(html_content, md_content)
        self._check_no_placeholders(html_content)
        self._check_no_emojis(html_content)
        self._check_structure(html_content)
        self._check_citations(html_content, md_content)
        self._check_bibliography(html_content, md_content)

        # Report results
        self._print_results()

        return len(self.errors) == 0

    def _check_sections(self, html: str, md: str):
        """Verify all markdown sections are present in HTML"""
        # Extract section headings from markdown
        md_sections = re.findall(r'^## (.+)$', md, re.MULTILINE)

        # Extract sections from HTML
        html_sections = re.findall(r'<h2 class="section-title">(.+?)</h2>', html)

        # Check if we have placeholder sections like <div class="section">#</div>
        placeholder_sections = re.findall(r'<div class="section">#</div>', html)

        if placeholder_sections:
            self.errors.append(
                f"Found {len(placeholder_sections)} placeholder sections (empty '#' divs) - content not converted properly"
            )

        # Compare section counts
        if len(md_sections) > len(html_sections) + 1:  # +1 for bibliography which is separate
            self.errors.append(
                f"Section count mismatch: MD has {len(md_sections)} sections, HTML has only {len(html_sections)} + bibliography"
            )
            missing = set(md_sections) - set(html_sections)
            if missing:
                self.errors.append(f"Missing sections in HTML: {missing}")

        # Verify Executive Summary is present
        if "Executive Summary" in md and "Executive Summary" not in html:
            self.errors.append("Executive Summary missing from HTML")

    def _check_no_placeholders(self, html: str):
        """Check for common placeholders that shouldn't be in final report"""
        placeholders = [
            '{{TITLE}}', '{{DATE}}', '{{CONTENT}}', '{{BIBLIOGRAPHY}}',
            '{{METRICS_DASHBOARD}}', '{{SOURCE_COUNT}}', 'TODO', 'TBD',
            'PLACEHOLDER', 'FIXME',
        ]

        # Plain, case-sensitive substring search over the whole document.
        found = [placeholder for placeholder in placeholders if placeholder in html]

        if found:
            self.errors.append(f"Found unreplaced placeholders: {', '.join(found)}")

    def _check_no_emojis(self, html: str):
        """Verify no emojis are present in HTML"""
        # Each match is a run of consecutive emoji code points, so the
        # reported count is the number of runs.
        emojis = _EMOJI_PATTERN.findall(html)
        if emojis:
            unique_emojis = set(emojis)
            self.errors.append(f"Found {len(emojis)} emojis in HTML (should be none): {unique_emojis}")

    def _check_structure(self, html: str):
        """Verify HTML has proper structure"""
        required_elements: List[Tuple[str, str]] = [
            ('<html', 'HTML tag'),
            ('<head', 'head tag'),
            ('<body', 'body tag'),
            ('<title>', 'title tag'),
            ('class="header"', 'header section'),
            ('class="content"', 'content section'),
            ('class="bibliography"', 'bibliography section'),
        ]

        for element, name in required_elements:
            if element not in html:
                self.errors.append(f"Missing {name} in HTML")

        # Check for unclosed tags (basic check)
        open_divs = html.count('<div')
        close_divs = html.count('</div>')

        if abs(open_divs - close_divs) > 2:  # Allow small discrepancy
            self.warnings.append(
                f"Possible unclosed divs: {open_divs} opening tags, {close_divs} closing tags"
            )

    def _check_citations(self, html: str, md: str):
        """Verify citations are present"""
        # Extract citations from markdown
        md_citations = set(re.findall(r'\[(\d+)\]', md))

        # Extract citations from HTML (excluding bibliography): only the text
        # before the first bibliography marker is searched.
        html_content = html.split('class="bibliography"')[0] if 'class="bibliography"' in html else html
        html_citations = set(re.findall(r'\[(\d+)\]', html_content))

        if len(md_citations) > 0 and len(html_citations) == 0:
            self.errors.append("No citations found in HTML content (but present in MD)")

        if len(md_citations) > len(html_citations) * 1.5:  # Allow some variation
            self.warnings.append(
                f"Fewer citations in HTML ({len(html_citations)}) than MD ({len(md_citations)})"
            )

    def _check_bibliography(self, html: str, md: str):
        """Verify bibliography is present and formatted"""
        if '## Bibliography' in md:
            if 'class="bibliography"' not in html:
                self.errors.append("Bibliography section missing from HTML")
            elif 'class="bib-entry"' not in html:
                self.warnings.append("Bibliography present but entries not properly formatted")

    def _print_results(self):
        """Print verification results"""
        print(f"\n{'-'*60}")
        print("VERIFICATION RESULTS")
        print(f"{'-'*60}\n")

        if self.errors:
            print(f"❌ ERRORS ({len(self.errors)}):")
            for i, error in enumerate(self.errors, 1):
                print(f" {i}. {error}")
            print()

        if self.warnings:
            print(f"⚠️ WARNINGS ({len(self.warnings)}):")
            for i, warning in enumerate(self.warnings, 1):
                print(f" {i}. {warning}")
            print()

        if not self.errors and not self.warnings:
            print("✅ All checks passed! HTML report is valid.")
            print()

        print(f"{'-'*60}\n")


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Main entry point

    Args:
        argv: Command-line arguments to parse. When None, argparse reads
            them from ``sys.argv``.

    Returns:
        0 if verification succeeds, 1 if an input file is missing or any
        check records an error.
    """
    parser = argparse.ArgumentParser(description='Verify HTML research report')
    parser.add_argument('--html', type=Path, required=True, help='Path to HTML report')
    parser.add_argument('--md', type=Path, required=True, help='Path to markdown report')

    args = parser.parse_args(argv)

    if not args.html.exists():
        print(f"Error: HTML file not found: {args.html}")
        return 1

    if not args.md.exists():
        print(f"Error: Markdown file not found: {args.md}")
        return 1

    verifier = HTMLVerifier(args.html, args.md)
    success = verifier.verify()

    return 0 if success else 1


if __name__ == "__main__":
    # The bare exit() helper is injected by the site module and is not
    # always available; raising SystemExit needs no import.
    raise SystemExit(main())