Source from repo
Deep Research

Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
199-biotechnologies · GitHub · 199-biotechnologies · Source repo · Original GitHub link
Files
Skill
n/a
Size
221.7 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/verify_html.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 221 lines · Free
scripts/verify_html.py
1#!/usr/bin/env python3
2"""
3HTML Report Verification Script
4Validates that HTML reports are properly generated with all sections from MD
5"""
6 
7import argparse
8import re
9from pathlib import Path
10from typing import List, Tuple
11 
12 
class HTMLVerifier:
    """Verify an HTML research report against the markdown it was generated from.

    The ``_check_*`` methods append messages to ``self.errors`` (failures) and
    ``self.warnings`` (advisories). ``verify`` runs them all, prints a summary,
    and reports whether any errors were recorded.
    """

    # Emoji code points only. A single catch-all range such as
    # U+24C2..U+1F251 would also span CJK ideographs, Hangul and box-drawing
    # characters and flag ordinary non-Latin text, so the blocks are listed
    # individually instead.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed supplements, flags
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "]+"
    )

    def __init__(self, html_path: Path, md_path: Path):
        """Remember the two report paths and start with empty result lists."""
        self.html_path = html_path
        self.md_path = md_path
        self.errors: List[str] = []
        self.warnings: List[str] = []

    def verify(self) -> bool:
        """
        Run all verification checks and print the results

        Returns:
            True if all checks pass, False otherwise (including when either
            file cannot be read)
        """
        print(f"\n{'='*60}")
        print("HTML REPORT VERIFICATION")
        print(f"{'='*60}\n")

        print(f"HTML File: {self.html_path}")
        print(f"MD File: {self.md_path}\n")

        # Read files. The encoding is pinned so the result does not depend on
        # the platform's default locale.
        try:
            html_content = self.html_path.read_text(encoding="utf-8")
            md_content = self.md_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            self.errors.append(f"Failed to read files: {e}")
            # Surface the failure; otherwise the caller only sees a False.
            self._print_results()
            return False

        # Run checks
        self._check_sections(html_content, md_content)
        self._check_no_placeholders(html_content)
        self._check_no_emojis(html_content)
        self._check_structure(html_content)
        self._check_citations(html_content, md_content)
        self._check_bibliography(html_content, md_content)

        # Report results
        self._print_results()

        return not self.errors

    def _check_sections(self, html: str, md: str):
        """Verify all markdown sections are present in HTML"""
        # Level-2 markdown headings are the report's sections
        md_sections = re.findall(r'^## (.+)$', md, re.MULTILINE)

        # Section titles as rendered in the HTML
        html_sections = re.findall(r'<h2 class="section-title">(.+?)</h2>', html)

        # Placeholder sections look like <div class="section">#</div>
        placeholder_sections = re.findall(r'<div class="section">#</div>', html)

        if placeholder_sections:
            self.errors.append(
                f"Found {len(placeholder_sections)} placeholder sections (empty '#' divs) - content not converted properly"
            )

        # Compare section counts
        if len(md_sections) > len(html_sections) + 1:  # +1 for bibliography which is separate
            self.errors.append(
                f"Section count mismatch: MD has {len(md_sections)} sections, HTML has only {len(html_sections)} + bibliography"
            )
            missing = set(md_sections) - set(html_sections)
            if missing:
                self.errors.append(f"Missing sections in HTML: {missing}")

        # Verify Executive Summary is present
        if "Executive Summary" in md and "Executive Summary" not in html:
            self.errors.append("Executive Summary missing from HTML")

    def _check_no_placeholders(self, html: str):
        """Check for common placeholders that shouldn't be in final report"""
        placeholders = [
            '{{TITLE}}', '{{DATE}}', '{{CONTENT}}', '{{BIBLIOGRAPHY}}',
            '{{METRICS_DASHBOARD}}', '{{SOURCE_COUNT}}', 'TODO', 'TBD',
            'PLACEHOLDER', 'FIXME'
        ]

        # Plain substring match, reported in the order listed above
        found = [placeholder for placeholder in placeholders if placeholder in html]

        if found:
            self.errors.append(f"Found unreplaced placeholders: {', '.join(found)}")

    def _check_no_emojis(self, html: str):
        """Verify no emojis are present in HTML

        Each match is a run of consecutive emoji characters, so the reported
        count is the number of runs.
        """
        emojis = self._EMOJI_PATTERN.findall(html)
        if emojis:
            unique_emojis = set(emojis)
            self.errors.append(f"Found {len(emojis)} emojis in HTML (should be none): {unique_emojis}")

    def _check_structure(self, html: str):
        """Verify HTML has proper structure"""
        required_elements = [
            ('<html', 'HTML tag'),
            ('<head', 'head tag'),
            ('<body', 'body tag'),
            ('<title>', 'title tag'),
            ('class="header"', 'header section'),
            ('class="content"', 'content section'),
            ('class="bibliography"', 'bibliography section'),
        ]

        for element, name in required_elements:
            if element not in html:
                self.errors.append(f"Missing {name} in HTML")

        # Check for unclosed tags (basic check based on substring counts)
        open_divs = html.count('<div')
        close_divs = html.count('</div>')

        if abs(open_divs - close_divs) > 2:  # Allow small discrepancy
            self.warnings.append(
                f"Possible unclosed divs: {open_divs} opening tags, {close_divs} closing tags"
            )

    def _check_citations(self, html: str, md: str):
        """Verify citations are present"""
        # Distinct numeric citation markers such as [12] in the markdown
        md_citations = set(re.findall(r'\[(\d+)\]', md))

        # Only look at the HTML before the bibliography, whose entries carry
        # the same [n] markers
        html_content = html.split('class="bibliography"')[0] if 'class="bibliography"' in html else html
        html_citations = set(re.findall(r'\[(\d+)\]', html_content))

        if md_citations and not html_citations:
            self.errors.append("No citations found in HTML content (but present in MD)")

        if len(md_citations) > len(html_citations) * 1.5:  # Allow some variation
            self.warnings.append(
                f"Fewer citations in HTML ({len(html_citations)}) than MD ({len(md_citations)})"
            )

    def _check_bibliography(self, html: str, md: str):
        """Verify bibliography is present and formatted"""
        # Only enforced when the markdown itself has a bibliography section
        if '## Bibliography' in md:
            if 'class="bibliography"' not in html:
                self.errors.append("Bibliography section missing from HTML")
            elif 'class="bib-entry"' not in html:
                self.warnings.append("Bibliography present but entries not properly formatted")

    def _print_results(self):
        """Print verification results"""
        print(f"\n{'-'*60}")
        print("VERIFICATION RESULTS")
        print(f"{'-'*60}\n")

        if self.errors:
            print(f"❌ ERRORS ({len(self.errors)}):")
            for i, error in enumerate(self.errors, 1):
                print(f"  {i}. {error}")
            print()

        if self.warnings:
            print(f"⚠️  WARNINGS ({len(self.warnings)}):")
            for i, warning in enumerate(self.warnings, 1):
                print(f"  {i}. {warning}")
            print()

        if not self.errors and not self.warnings:
            print("✅ All checks passed! HTML report is valid.")
            print()

        print(f"{'-'*60}\n")
195 
196 
def main() -> int:
    """Parse command-line arguments, verify the report and return an exit code.

    Returns:
        0 when verification records no errors; 1 when either input path is
        not an existing regular file or when verification records errors.
    """
    parser = argparse.ArgumentParser(description='Verify HTML research report')
    parser.add_argument('--html', type=Path, required=True, help='Path to HTML report')
    parser.add_argument('--md', type=Path, required=True, help='Path to markdown report')

    args = parser.parse_args()

    # is_file() rather than exists(): a directory passes exists() but cannot
    # be read as a report, so reject it here with a clear message.
    for label, path in (("HTML", args.html), ("Markdown", args.md)):
        if not path.is_file():
            print(f"Error: {label} file not found: {path}")
            return 1

    verifier = HTMLVerifier(args.html, args.md)
    success = verifier.verify()

    return 0 if success else 1
218 
if __name__ == "__main__":
    # Raise SystemExit directly: the ``exit`` builtin is installed by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(main())
221
Preparing the source view

Deep Research

scripts/verify_html.py