Source from repo
Web Content Fetcher — 网页正文提取

Extract clean Markdown content from any URL using a three-tier strategy: Jina Reader, Scrapling, or web_fetch.
shirenchuang · GitHub: shirenchuang · Source repo · Original GitHub link · Publisher page
Files
Skill
n/a
Size
55.4 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/fetch.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 238 lines · Free
scripts/fetch.py
1#!/usr/bin/env python3
2"""
3Universal web content extractor (Scrapling + html2text).
4Returns clean Markdown with headings, links, images, lists, and code blocks.
5 
6Usage:
7  python3 fetch.py <url> [max_chars] [--stealth] [--json]
8 
9Modes:
10  (default)   Fast HTTP fetch via Fetcher — works for most sites (~1-3s)
11  --stealth   Headless browser via StealthyFetcher — for JS-rendered or
12              anti-scraping sites like WeChat, Zhihu, Juejin (~5-15s)
13 
14Examples:
15  python3 fetch.py https://sspai.com/post/73145
16  python3 fetch.py https://mp.weixin.qq.com/s/xxx 30000 --stealth
17  python3 fetch.py https://zhuanlan.zhihu.com/p/12345 --stealth
18"""
19 
20import sys
21import re
22import json
23import logging
24 
25 
def check_dependencies(packages=("scrapling", "html2text")):
    """
    Verify that the required third-party packages can be imported.

    Each name in *packages* is imported in turn. If at least one import
    fails, a message listing the missing packages and a matching
    ``pip install`` command is written to stderr and the process exits
    with status 1. When every import succeeds the function returns None.

    Args:
        packages: Iterable of importable module names to check. Defaults
            to the two packages this script depends on.

    Raises:
        SystemExit: With code 1 when any package cannot be imported.
    """
    import importlib

    missing = []
    for name in packages:
        try:
            # Actually import (rather than only locating) the module so a
            # broken installation is reported as missing too.
            importlib.import_module(name)
        except ImportError:
            missing.append(name)

    if not missing:
        return

    print(
        f"Error: missing dependencies: {', '.join(missing)}\n"
        f"Install with:\n"
        f"  pip install {' '.join(missing)}",
        file=sys.stderr,
    )
    sys.exit(1)
46 
47 
# An <img ...> tag carrying a double-quoted data-src attribute.
_LAZY_IMG_RE = re.compile(r'<img([^>]*?)\sdata-src="([^"]+)"([^>]*?)>')

# A src attribute with a quoted or bare value. The required leading
# whitespace keeps this from matching the tail of "data-src".
_SRC_ATTR_RE = re.compile(
    r'''\ssrc\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)''',
    re.IGNORECASE,
)


def fix_lazy_images(html_raw):
    """
    Promote data-src to src for lazy-loaded images (WeChat, Zhihu, etc.).

    Many Chinese platforms put the real image URL in data-src while src
    holds a tiny placeholder. html2text only reads src, so every <img>
    tag with a double-quoted data-src is rewritten so that its only src
    attribute is the data-src URL. Any src the tag already had is removed;
    otherwise the tag would end up with two src attributes and the
    placeholder could still be the one that gets picked up.

    Args:
        html_raw: HTML markup as a string.

    Returns:
        The markup with lazy-loaded images rewritten. Tags without a
        data-src attribute are returned unchanged.
    """

    def _promote(match):
        before, real_url, after = match.groups()
        # Drop placeholder src attributes on either side of data-src.
        before = _SRC_ATTR_RE.sub("", before)
        after = _SRC_ATTR_RE.sub("", after)
        return f'<img{before} src="{real_url}"{after}>'

    return _LAZY_IMG_RE.sub(_promote, html_raw)
59 
60 
# Candidate CSS selectors for the article body, tried from top to bottom.
# The first one whose converted content is long enough is used, which covers
# most blog/article platforms without per-site customization.
CONTENT_SELECTORS = [
    # Semantic containers
    "article",
    "main",
    # Common class names
    ".post-content",
    ".entry-content",
    ".article-content",
    ".article-body",
    # Site-specific containers
    ".article-detail",  # 36kr
    ".article-holder",  # InfoQ
    ".post_body",  # 163.com (NetEase)
    ".markdown-body",  # GitHub
    ".Post-RichText",  # Zhihu
    "#article_content",  # CSDN
    ".article-area",  # Juejin
    ".ssa-article",  # Toutiao
    # Attribute-based hints
    '[role="article"]',
    '[itemprop="articleBody"]',
]

# WeChat (mp.weixin.qq.com) has its own DOM structure; these selectors are
# tried ahead of the generic ones for WeChat URLs.
WECHAT_SELECTORS = [
    "div#js_content",
    "div.rich_media_content",
]

# A selector match shorter than this many characters is not treated as
# real content.
MIN_CONTENT_LENGTH = 200
90 
91 
def html_to_markdown(html_raw, max_chars=30000):
    """
    Render an HTML fragment as Markdown, capped at *max_chars* characters.

    Lazy-loaded images are normalised first, then html2text performs the
    conversion with links, images and emphasis kept and line wrapping off.
    Runs of three or more newlines are collapsed to a single blank line and
    surrounding whitespace is stripped before the result is truncated.
    """
    import html2text

    converter = html2text.HTML2Text()
    converter.body_width = 0  # No line wrapping
    converter.skip_internal_links = True
    # Keep links, images and emphasis in the output.
    converter.ignore_links = False
    converter.ignore_images = False
    converter.ignore_emphasis = False

    markdown = converter.handle(fix_lazy_images(html_raw))
    collapsed = re.sub(r"\n{3,}", "\n\n", markdown)
    return collapsed.strip()[:max_chars]
108 
109 
def extract_content(page, url, max_chars=30000):
    """
    Locate the article body on a fetched page and convert it to Markdown.

    Selectors are tried in priority order (WeChat-specific ones first when
    the URL contains "mp.weixin.qq.com"). The first selector whose first
    matching element yields at least MIN_CONTENT_LENGTH characters of
    Markdown wins. If none qualifies, the whole page is converted.

    Args:
        page: Fetched page object; must provide ``css(selector)`` and
            ``html_content``, and matched elements must provide
            ``html_content``.
        url: The URL that was fetched, used only to detect WeChat pages.
        max_chars: Maximum number of characters in the returned Markdown.

    Returns:
        A ``(markdown_text, matched_selector)`` tuple. The selector is
        ``"body(fallback)"`` when the whole page was converted.
    """
    is_wechat = "mp.weixin.qq.com" in url
    selectors = (WECHAT_SELECTORS + CONTENT_SELECTORS) if is_wechat else CONTENT_SELECTORS

    # Judge "enough content" on at least MIN_CONTENT_LENGTH characters.
    # Truncating to a small max_chars before the check would make every
    # candidate look too short and force the whole-page fallback.
    if max_chars is None:
        probe_chars = None
    else:
        probe_chars = max(max_chars, MIN_CONTENT_LENGTH)

    for selector in selectors:
        els = page.css(selector)
        if not els:
            continue
        md = html_to_markdown(els[0].html_content, probe_chars)
        if len(md) >= MIN_CONTENT_LENGTH:
            return md[:max_chars], selector

    # Fallback: convert the entire page
    md = html_to_markdown(page.html_content, max_chars)
    return md, "body(fallback)"
128 
129 
def _suppress_scrapling_logs():
    """Raise the "scrapling" logger's level to CRITICAL to mute its noisy output."""
    scrapling_logger = logging.getLogger("scrapling")
    scrapling_logger.setLevel(logging.CRITICAL)
133 
134 
def fetch_fast(url, max_chars=30000, timeout=15):
    """
    Fetch *url* over plain HTTP (no JavaScript execution) and extract it.

    Suitable for most blogs and static sites. *timeout* is forwarded
    unchanged to ``Fetcher.get``.
    NOTE(review): the value looks like seconds — confirm against Scrapling.

    Returns:
        The ``(markdown_text, matched_selector)`` tuple from extract_content.
    """
    from scrapling.fetchers import Fetcher

    _suppress_scrapling_logs()

    fetcher = Fetcher()
    response = fetcher.get(url, timeout=timeout, stealthy_headers=True)
    return extract_content(response, url, max_chars)
145 
146 
def fetch_stealth(url, max_chars=30000, timeout=30000):
    """
    Fetch *url* with a headless browser and extract it.

    Executes JavaScript and is meant for anti-scraping or JS-rendered
    pages such as WeChat articles, Zhihu and Juejin. Slower than the fast
    path (~5-15s) but more reliable for protected content. *timeout* is
    forwarded unchanged to ``StealthyFetcher.fetch``.
    NOTE(review): the value looks like milliseconds — confirm against
    Scrapling.

    Returns:
        The ``(markdown_text, matched_selector)`` tuple from extract_content.
    """
    from scrapling.fetchers import StealthyFetcher

    _suppress_scrapling_logs()

    browser_options = {
        "headless": True,
        "network_idle": True,
        "timeout": timeout,
    }
    response = StealthyFetcher().fetch(url, **browser_options)
    return extract_content(response, url, max_chars)
163 
164 
def fetch(url, max_chars=30000, stealth=False):
    """
    Main entry point. Fetch *url* and return ``(markdown, selector, mode)``.

    With ``stealth=True`` the headless browser is used directly and mode is
    ``"stealth"``. Otherwise the fast HTTP fetch runs first; if its result
    is too short (likely a JS-rendered page) the stealth fetch is tried as
    a best-effort fallback, and its result is used only when it is longer.
    In that case mode is ``"stealth(auto-fallback)"``; in every other case
    it is ``"fast"``.

    Args:
        url: Page to fetch.
        max_chars: Maximum number of characters of Markdown to return.
        stealth: Skip the fast path and use the headless browser.

    Raises:
        Exception: Whatever the fast fetch (or the explicit stealth fetch)
            raises. Errors from the automatic fallback are not propagated.
    """
    if stealth:
        md, selector = fetch_stealth(url, max_chars)
        return md, selector, "stealth"

    # Try fast mode first
    md, selector = fetch_fast(url, max_chars)

    # Output is capped at max_chars, so a result that fills a small cap was
    # truncated, not short. Cap the threshold too, otherwise every call
    # with max_chars < MIN_CONTENT_LENGTH would launch the browser.
    if max_chars is None:
        short_threshold = MIN_CONTENT_LENGTH
    else:
        short_threshold = min(MIN_CONTENT_LENGTH, max_chars)

    if len(md) >= short_threshold:
        return md, selector, "fast"

    # Fast mode got barely any content; the page likely needs JS rendering.
    try:
        md_stealth, sel_stealth = fetch_stealth(url, max_chars)
    except Exception as exc:
        # Best-effort fallback: keep the fast result, but leave a trace for
        # anyone debugging instead of swallowing the failure silently.
        logging.getLogger(__name__).debug(
            "Stealth fallback failed for %s: %s: %s",
            url, type(exc).__name__, exc,
        )
        return md, selector, "fast"

    if len(md_stealth) > len(md):
        return md_stealth, sel_stealth, "stealth(auto-fallback)"
    return md, selector, "fast"
188 
189 
def main():
    """
    Command-line entry point.

    Reads ``sys.argv``: the first non-flag argument is the URL, the optional
    second one is max_chars (default 30000). ``--stealth`` and ``--json``
    may appear anywhere on the command line. On success the Markdown (or a
    JSON document with metadata) is printed to stdout.

    Exits with status 1 when no URL is given, when max_chars is not a
    positive integer, or when fetching fails. A fetch error is printed as
    JSON on stdout in ``--json`` mode and as plain text on stderr otherwise.
    """
    cli_args = sys.argv[1:]
    # Split flags from positionals first so a leading flag (for example
    # "fetch.py --stealth <url>") is never mistaken for the URL.
    positionals = [a for a in cli_args if not a.startswith("--")]

    if not positionals:
        print(
            "Usage: python3 fetch.py <url> [max_chars] [--stealth] [--json]\n"
            "\n"
            "Options:\n"
            "  max_chars   Maximum output characters (default: 30000)\n"
            "  --stealth   Use headless browser for JS-rendered pages\n"
            "  --json      Output as JSON with metadata\n",
            file=sys.stderr,
        )
        sys.exit(1)

    url = positionals[0]
    stealth = "--stealth" in cli_args
    json_output = "--json" in cli_args

    max_chars = 30000
    if len(positionals) > 1:
        raw_max_chars = positionals[1]
        try:
            max_chars = int(raw_max_chars)
        except ValueError:
            max_chars = None
        # Report bad values as a usage error instead of an uncaught
        # ValueError traceback (or a nonsensical negative slice later on).
        if max_chars is None or max_chars <= 0:
            print(
                f"Error: max_chars must be a positive integer, got {raw_max_chars!r}",
                file=sys.stderr,
            )
            sys.exit(1)

    try:
        md, selector, mode = fetch(url, max_chars, stealth=stealth)

        if json_output:
            result = {
                "url": url,
                "mode": mode,
                "selector": selector,
                "content_length": len(md),
                "content": md,
            }
            print(json.dumps(result, ensure_ascii=False, indent=2))
        else:
            print(md)

    except Exception as e:
        # Top-level boundary: turn any fetch failure into a one-line error.
        error_msg = f"Error fetching {url}: {type(e).__name__}: {e}"
        if json_output:
            print(json.dumps({"url": url, "error": error_msg}, ensure_ascii=False))
        else:
            print(error_msg, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Fail fast with install instructions before attempting any fetch.
    check_dependencies()
    main()
238
Preparing the source view

Web Content Fetcher — 网页正文提取

scripts/fetch.py