Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Extract clean Markdown content from any URL using a three-tier strategy: Jina Reader, Scrapling, or web_fetch.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/fetch.py
#!/usr/bin/env python3
"""
Universal web content extractor (Scrapling + html2text).
Returns clean Markdown with headings, links, images, lists, and code blocks.

Usage:
    python3 fetch.py <url> [max_chars] [--stealth] [--json]

Modes:
    (default)   Fast HTTP fetch via Fetcher — works for most sites (~1-3s)
    --stealth   Headless browser via StealthyFetcher — for JS-rendered or
                anti-scraping sites like WeChat, Zhihu, Juejin (~5-15s)
    --json      Print a JSON object (url, mode, selector, content_length,
                content) instead of bare Markdown

Examples:
    python3 fetch.py https://sspai.com/post/73145
    python3 fetch.py https://mp.weixin.qq.com/s/xxx 30000 --stealth
    python3 fetch.py https://zhuanlan.zhihu.com/p/12345 --stealth
"""

import sys
import re
import json
import logging

_LOG = logging.getLogger(__name__)


def check_dependencies():
    """Exit with status 1 and install instructions if a required package is missing."""
    missing = []
    try:
        import scrapling  # noqa: F401
    except ImportError:
        missing.append("scrapling")
    try:
        import html2text  # noqa: F401
    except ImportError:
        missing.append("html2text")

    if missing:
        print(
            f"Error: missing dependencies: {', '.join(missing)}\n"
            f"Install with:\n"
            f"  pip install {' '.join(missing)}",
            file=sys.stderr,
        )
        sys.exit(1)


# One whole <img ...> tag (the lookahead keeps e.g. <imgfoo> from matching).
_IMG_TAG_RE = re.compile(r"<img(?=[\s/>])[^>]*>", re.IGNORECASE)

# One attribute inside a tag: name, optionally followed by a quoted or bare value.
# Quoted values are consumed whole, so "src=" text inside e.g. an alt value is
# never mistaken for a real attribute.
_ATTR_RE = re.compile(
    r"""\s+([^\s=/>"']+)(?:\s*=\s*("[^"]*"|'[^']*'|[^\s>]+))?"""
)


def _unquote(raw_value):
    """Return an attribute value without its surrounding quotes ('' for None)."""
    if not raw_value:
        return ""
    if len(raw_value) >= 2 and raw_value[0] in "\"'" and raw_value[-1] == raw_value[0]:
        return raw_value[1:-1]
    return raw_value


def _promote_data_src(tag_match):
    """Rewrite one <img> tag so its first non-empty data-src becomes the only src."""
    tag = tag_match.group(0)
    attrs = list(_ATTR_RE.finditer(tag))
    lazy = next(
        (
            attr
            for attr in attrs
            if attr.group(1).lower() == "data-src" and _unquote(attr.group(2))
        ),
        None,
    )
    if lazy is None:
        return tag  # nothing to promote; leave the tag byte-identical

    pieces = []
    cursor = 0
    for attr in attrs:
        if attr is lazy:
            # Reuse the original value token so its quoting stays valid.
            replacement = f" src={attr.group(2)}"
        elif attr.group(1).lower() == "src":
            # Drop the placeholder: leaving two src attributes lets the
            # placeholder win in parsers that keep the last duplicate.
            replacement = ""
        else:
            continue
        pieces.append(tag[cursor:attr.start()])
        pieces.append(replacement)
        cursor = attr.end()
    pieces.append(tag[cursor:])
    return "".join(pieces)


def fix_lazy_images(html_raw):
    """
    Promote data-src to src for lazy-loaded images (WeChat, Zhihu, etc.).

    Many Chinese platforms use data-src for the real image URL while src
    holds a tiny placeholder. html2text only reads src, so each <img> that
    carries a non-empty data-src is rewritten to have exactly one src holding
    that URL; any pre-existing placeholder src is removed. Tags without
    data-src are returned unchanged.
    """
    return _IMG_TAG_RE.sub(_promote_data_src, html_raw)


# CSS selectors in priority order — the first match with enough content wins.
# Covers most blog/article platforms without needing per-site customization.
CONTENT_SELECTORS = [
    "article",
    "main",
    ".post-content",
    ".entry-content",
    ".article-content",
    ".article-body",
    ".article-detail",  # 36kr
    ".article-holder",  # InfoQ
    ".post_body",  # 163.com (NetEase)
    ".markdown-body",  # GitHub
    ".Post-RichText",  # Zhihu
    "#article_content",  # CSDN
    ".article-area",  # Juejin
    ".ssa-article",  # Toutiao
    '[role="article"]',
    '[itemprop="articleBody"]',
]

# WeChat has a unique DOM structure — try these first for mp.weixin.qq.com
WECHAT_SELECTORS = [
    "div#js_content",
    "div.rich_media_content",
]

# Minimum characters for a selector match to be considered "real content"
MIN_CONTENT_LENGTH = 200


def html_to_markdown(html_raw, max_chars=30000):
    """
    Convert raw HTML to clean Markdown.

    Lazy-loaded images are fixed first, runs of three or more newlines are
    collapsed to one blank line, and the result is truncated to max_chars.
    """
    import html2text

    html_raw = fix_lazy_images(html_raw)

    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = False
    h.body_width = 0  # No line wrapping
    h.skip_internal_links = True
    h.ignore_emphasis = False

    md = h.handle(html_raw)
    md = re.sub(r"\n{3,}", "\n\n", md).strip()
    return md[:max_chars]


def extract_content(page, url, max_chars=30000):
    """
    Try content selectors to find the article body.

    Only the first element matched by each selector is considered, and it is
    accepted when its Markdown is at least MIN_CONTENT_LENGTH characters.
    If no selector qualifies, the entire page is converted instead.

    Returns (markdown_text, matched_selector); matched_selector is
    "body(fallback)" when the whole page was used.
    """
    is_wechat = "mp.weixin.qq.com" in url
    selectors = (WECHAT_SELECTORS + CONTENT_SELECTORS) if is_wechat else CONTENT_SELECTORS

    for selector in selectors:
        els = page.css(selector)
        if els:
            md = html_to_markdown(els[0].html_content, max_chars)
            if len(md) >= MIN_CONTENT_LENGTH:
                return md, selector

    # Fallback: convert the entire page
    md = html_to_markdown(page.html_content, max_chars)
    return md, "body(fallback)"


def _suppress_scrapling_logs():
    """Scrapling's logger is noisy (deprecation warnings, fetch info). Silence it."""
    logging.getLogger("scrapling").setLevel(logging.CRITICAL)


def fetch_fast(url, max_chars=30000, timeout=15):
    """
    Fast HTTP fetch — no JavaScript execution.
    Works for most blogs and static sites.

    timeout is passed straight to Scrapling's Fetcher.get.
    NOTE(review): the default of 15 suggests seconds — confirm against Scrapling.

    Returns (markdown_text, matched_selector).
    """
    from scrapling.fetchers import Fetcher
    _suppress_scrapling_logs()

    page = Fetcher().get(url, timeout=timeout, stealthy_headers=True)
    return extract_content(page, url, max_chars)


def fetch_stealth(url, max_chars=30000, timeout=30000):
    """
    Headless browser fetch — executes JavaScript, bypasses anti-scraping.
    Required for: WeChat articles, Zhihu, Juejin, and other JS-rendered pages.
    Slower (~5-15s) but more reliable for protected content.

    timeout is passed straight to Scrapling's StealthyFetcher.fetch.
    NOTE(review): the default of 30000 suggests milliseconds — confirm against Scrapling.

    Returns (markdown_text, matched_selector).
    """
    from scrapling.fetchers import StealthyFetcher
    _suppress_scrapling_logs()

    page = StealthyFetcher().fetch(
        url,
        headless=True,
        network_idle=True,
        timeout=timeout,
    )
    return extract_content(page, url, max_chars)


def fetch(url, max_chars=30000, stealth=False):
    """
    Main entry point. Fetches URL and returns (markdown, selector, mode).

    If stealth=False, tries fast mode first and falls back to stealth
    when the result is too short (likely a JS-rendered page). The fallback
    is best-effort: if it fails or yields no more text, the fast result is
    returned. Errors from the fast fetch itself propagate to the caller.

    mode is one of "stealth", "stealth(auto-fallback)" or "fast".
    """
    if stealth:
        md, selector = fetch_stealth(url, max_chars)
        return md, selector, "stealth"

    # Try fast mode first
    md, selector = fetch_fast(url, max_chars)

    # If fast mode got barely any content, the page likely needs JS rendering
    if len(md) < MIN_CONTENT_LENGTH:
        try:
            md_stealth, sel_stealth = fetch_stealth(url, max_chars)
        except Exception as exc:
            # Stick with the fast-mode result, but leave a trace for debugging
            # instead of discarding the failure entirely.
            _LOG.debug(
                "Stealth fallback failed for %s: %s: %s",
                url,
                type(exc).__name__,
                exc,
            )
        else:
            if len(md_stealth) > len(md):
                return md_stealth, sel_stealth, "stealth(auto-fallback)"

    return md, selector, "fast"


_USAGE = (
    "Usage: python3 fetch.py <url> [max_chars] [--stealth]\n"
    "\n"
    "Options:\n"
    "  max_chars    Maximum output characters (default: 30000)\n"
    "  --stealth    Use headless browser for JS-rendered pages\n"
    "  --json       Output as JSON with metadata\n"
)


def _parse_cli_args(argv):
    """
    Split command-line arguments into (url, max_chars, stealth, json_output).

    Arguments starting with "--" are flags and may appear anywhere; the first
    remaining argument is the URL and the optional second one is max_chars.
    Raises ValueError with a user-facing message on a missing URL or a
    max_chars that is not a positive integer.
    """
    positional = [arg for arg in argv if not arg.startswith("--")]
    if not positional:
        raise ValueError("missing <url>")

    max_chars = 30000
    if len(positional) > 1:
        raw = positional[1]
        try:
            max_chars = int(raw)
        except ValueError:
            raise ValueError(f"max_chars must be an integer, got {raw!r}") from None
        if max_chars < 1:
            raise ValueError(f"max_chars must be positive, got {max_chars}")

    return positional[0], max_chars, "--stealth" in argv, "--json" in argv


def main():
    """
    Command-line entry point.

    Prints the extracted Markdown (or a JSON object with --json) to stdout.
    Exits with status 1 on bad arguments or when fetching fails; fetch errors
    are printed to stderr, or as a JSON object on stdout with --json.
    """
    if len(sys.argv) < 2:
        print(_USAGE, file=sys.stderr)
        sys.exit(1)

    try:
        url, max_chars, stealth, json_output = _parse_cli_args(sys.argv[1:])
    except ValueError as e:
        print(f"Error: {e}\n\n{_USAGE}", file=sys.stderr)
        sys.exit(1)

    try:
        md, selector, mode = fetch(url, max_chars, stealth=stealth)

        if json_output:
            result = {
                "url": url,
                "mode": mode,
                "selector": selector,
                "content_length": len(md),
                "content": md,
            }
            print(json.dumps(result, ensure_ascii=False, indent=2))
        else:
            print(md)

    except Exception as e:
        error_msg = f"Error fetching {url}: {type(e).__name__}: {e}"
        if json_output:
            print(json.dumps({"url": url, "error": error_msg}, ensure_ascii=False))
        else:
            print(error_msg, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    check_dependencies()
    main()