Source from repo
Deep Research

Enterprise-grade research with multi-source synthesis, citation tracking, and verification. 8-phase pipeline with auto-continuation.
199-biotechnologies · GitHub · 199-biotechnologies · Source repo · Original GitHub link
Files
Skill
n/a
Size
221.7 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/md_to_html.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 331 lines · Free
scripts/md_to_html.py
1#!/usr/bin/env python3
2"""
3Markdown to HTML converter for research reports
4Properly converts markdown sections to HTML while preserving structure and formatting
5"""
6 
7import re
8from typing import Tuple
9from pathlib import Path
10 
11 
def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert markdown to HTML in two parts: content and bibliography

    The report is cut at the first line that starts with ``## Bibliography``
    (up to three leading spaces are tolerated). Everything before that line
    is converted as main content; everything after it is converted as the
    bibliography.

    Args:
        markdown_text: Full markdown report text

    Returns:
        Tuple of (content_html, bibliography_html)
    """
    # maxsplit=1 keeps the whole tail in the bibliography even if the heading
    # text shows up again later; a plain str.split() would silently drop
    # everything after the second occurrence.  Anchoring at the line start
    # prevents a "### Bibliography" sub-heading from being cut in the middle.
    parts = re.split(
        r'^ {0,3}## Bibliography',
        markdown_text,
        maxsplit=1,
        flags=re.MULTILINE,
    )
    content_md = parts[0]
    bibliography_md = parts[1] if len(parts) > 1 else ""

    # Convert content (everything except bibliography)
    content_html = _convert_content_section(content_md)

    # Convert bibliography separately
    bibliography_html = _convert_bibliography_section(bibliography_md)

    return content_html, bibliography_html
34 
35 
def _convert_content_section(markdown: str) -> str:
    """Convert main content sections to HTML

    Steps, in order: drop everything before the first ``## `` heading,
    convert headings, convert inline formatting (bold, italic, code),
    convert lists, tables and paragraphs, close the section divs, and
    finally wrap the "Executive Summary" section in its own div.

    Args:
        markdown: Report markdown without the bibliography part.

    Returns:
        HTML fragment for the report body.
    """
    # Remove title and front matter: keep only what starts at the first
    # "## " heading (a "### " line can never start with "## ").
    lines = markdown.split('\n')
    first_section = next(
        (idx for idx, line in enumerate(lines) if line.startswith('## ')),
        len(lines),
    )
    html = '\n'.join(lines[first_section:])

    # Convert headers
    # ## Section Title -> <div class="section"><h2 class="section-title">...</h2>
    # The div is left open on purpose; _close_sections() closes it later.
    html = re.sub(
        r'^## (.+)$',
        r'<div class="section"><h2 class="section-title">\1</h2>',
        html,
        flags=re.MULTILINE
    )

    # ### Subsection -> <h3 class="subsection-title">Subsection</h3>
    html = re.sub(
        r'^### (.+)$',
        r'<h3 class="subsection-title">\1</h3>',
        html,
        flags=re.MULTILINE
    )

    # #### Subsubsection -> <h4 class="subsubsection-title">Title</h4>
    html = re.sub(
        r'^#### (.+)$',
        r'<h4 class="subsubsection-title">\1</h4>',
        html,
        flags=re.MULTILINE
    )

    # Convert **bold** text (must run before italic so "**" is consumed first)
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)

    # Convert *italic* text.  The opening "*" must not be followed by
    # whitespace and the closing "*" must not be preceded by whitespace;
    # otherwise the "* " marker of a bullet item would be taken as an
    # emphasis opener and the list item would be destroyed.
    html = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'<em>\1</em>', html)

    # Convert inline code `code`
    html = re.sub(r'`(.+?)`', r'<code>\1</code>', html)

    # Convert unordered and ordered lists
    html = _convert_lists(html)

    # Convert tables
    html = _convert_tables(html)

    # Convert paragraphs (wrap non-HTML lines in <p> tags)
    html = _convert_paragraphs(html)

    # Close all open sections
    html = _close_sections(html)

    # Wrap executive summary if present
    return _wrap_executive_summary(html)


def _wrap_executive_summary(html: str) -> str:
    """Wrap the "Executive Summary" section body in an executive-summary div.

    Expects the output of _close_sections(): consecutive sections are
    separated by a ``</div>`` line followed by a ``<div class="section">``
    line.  The wrapper div is opened right before the summary heading and
    closed right before the closing ``</div>`` of the same section, so the
    resulting markup stays balanced.
    """
    heading = '<h2 class="section-title">Executive Summary</h2>'
    opener = '<div class="executive-summary">'
    section_break = '\n</div>\n<div class="section">'

    chunks = html.split(section_break)
    wrapped = [
        chunk.replace(heading, opener + heading, 1) + '\n</div>'
        if heading in chunk else chunk
        for chunk in chunks
    ]
    return section_break.join(wrapped)
116 
117 
118def _convert_bibliography_section(markdown: str) -> str:
119    """Convert bibliography section to HTML"""
120    if not markdown.strip():
121        return ""
122 
123    html = markdown
124 
125    # Convert each [N] citation to a proper bibliography entry
126    # Look for patterns like [1] Title - URL
127    html = re.sub(
128        r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
129        r'<div class="bib-entry"><span class="bib-number">[\1]</span> <a href="\3" target="_blank">\2</a></div>',
130        html
131    )
132 
133    # Convert any remaining **bold** sections
134    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
135 
136    # Wrap in bibliography content div
137    html = f'<div class="bibliography-content">{html}</div>'
138 
139    return html
140 
141 
142def _convert_lists(html: str) -> str:
143    """Convert markdown lists to HTML lists"""
144    lines = html.split('\n')
145    result = []
146    in_list = False
147    list_level = 0
148 
149    for i, line in enumerate(lines):
150        stripped = line.strip()
151 
152        # Check for unordered list item
153        if stripped.startswith('- ') or stripped.startswith('* '):
154            if not in_list:
155                result.append('<ul>')
156                in_list = True
157                list_level = len(line) - len(line.lstrip())
158 
159            # Get the content after the marker
160            content = stripped[2:]
161            result.append(f'<li>{content}</li>')
162 
163        # Check for ordered list item
164        elif re.match(r'^\d+\.\s', stripped):
165            if not in_list:
166                result.append('<ol>')
167                in_list = True
168                list_level = len(line) - len(line.lstrip())
169 
170            # Get the content after the number and period
171            content = re.sub(r'^\d+\.\s', '', stripped)
172            result.append(f'<li>{content}</li>')
173 
174        else:
175            # Not a list item
176            if in_list:
177                # Check if we're still in the list (indented continuation)
178                current_level = len(line) - len(line.lstrip())
179                if current_level > list_level and stripped:
180                    # Continuation of previous list item
181                    if result[-1].endswith('</li>'):
182                        result[-1] = result[-1][:-5] + ' ' + stripped + '</li>'
183                    continue
184                else:
185                    # End of list
186                    result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
187                    in_list = False
188                    list_level = 0
189 
190            result.append(line)
191 
192    # Close any remaining open list
193    if in_list:
194        result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
195 
196    return '\n'.join(result)
197 
198 
199def _convert_tables(html: str) -> str:
200    """Convert markdown tables to HTML tables"""
201    lines = html.split('\n')
202    result = []
203    in_table = False
204 
205    for i, line in enumerate(lines):
206        if '|' in line and line.strip().startswith('|'):
207            if not in_table:
208                result.append('<table>')
209                in_table = True
210                # This is the header row
211                cells = [cell.strip() for cell in line.split('|')[1:-1]]
212                result.append('<thead><tr>')
213                for cell in cells:
214                    result.append(f'<th>{cell}</th>')
215                result.append('</tr></thead>')
216                result.append('<tbody>')
217            elif '---' in line:
218                # Skip separator row
219                continue
220            else:
221                # Data row
222                cells = [cell.strip() for cell in line.split('|')[1:-1]]
223                result.append('<tr>')
224                for cell in cells:
225                    result.append(f'<td>{cell}</td>')
226                result.append('</tr>')
227        else:
228            if in_table:
229                result.append('</tbody></table>')
230                in_table = False
231            result.append(line)
232 
233    if in_table:
234        result.append('</tbody></table>')
235 
236    return '\n'.join(result)
237 
238 
239def _convert_paragraphs(html: str) -> str:
240    """Wrap non-HTML lines in paragraph tags"""
241    lines = html.split('\n')
242    result = []
243    in_paragraph = False
244 
245    for line in lines:
246        stripped = line.strip()
247 
248        # Skip empty lines
249        if not stripped:
250            if in_paragraph:
251                result.append('</p>')
252                in_paragraph = False
253            result.append(line)
254            continue
255 
256        # Skip lines that are already HTML tags
257        if (stripped.startswith('<') and stripped.endswith('>')) or \
258           stripped.startswith('</') or \
259           '<h' in stripped or '<div' in stripped or '<ul' in stripped or \
260           '<ol' in stripped or '<li' in stripped or '<table' in stripped or \
261           '</div>' in stripped or '</ul>' in stripped or '</ol>' in stripped:
262            if in_paragraph:
263                result.append('</p>')
264                in_paragraph = False
265            result.append(line)
266            continue
267 
268        # Regular text line - wrap in paragraph
269        if not in_paragraph:
270            result.append('<p>' + line)
271            in_paragraph = True
272        else:
273            result.append(line)
274 
275    if in_paragraph:
276        result.append('</p>')
277 
278    return '\n'.join(result)
279 
280 
281def _close_sections(html: str) -> str:
282    """Close all open section divs"""
283    # Count open and closed divs
284    open_divs = html.count('<div class="section">')
285    closed_divs = html.count('</div>')
286 
287    # Add closing divs for sections
288    # Each section should be closed before the next section starts
289    lines = html.split('\n')
290    result = []
291    section_open = False
292 
293    for i, line in enumerate(lines):
294        if '<div class="section">' in line:
295            if section_open:
296                result.append('</div>')  # Close previous section
297            section_open = True
298        result.append(line)
299 
300    # Close final section if still open
301    if section_open:
302        result.append('</div>')
303 
304    return '\n'.join(result)
305 
306 
def main():
    """Test the converter with a sample markdown file

    Reads the markdown file named by the first command-line argument,
    converts it, and prints the first 1000 characters of the content HTML
    and the first 500 characters of the bibliography HTML.  Exits with
    status 1 when no argument is given or the file does not exist.
    """
    import sys

    if len(sys.argv) < 2:
        # Diagnostics go to stderr so stdout carries only the preview.
        print("Usage: python md_to_html.py <markdown_file>", file=sys.stderr)
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found", file=sys.stderr)
        sys.exit(1)

    # Explicit UTF-8: without it read_text() uses the platform's locale
    # encoding, which mis-decodes non-ASCII report text on some systems.
    markdown_text = md_file.read_text(encoding="utf-8")
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])
327 
328 
# Run the command-line preview only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
331
Preparing the source view

Deep Research

scripts/md_to_html.py