Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/adapters/hn/index.ts
import { JSDOM } from "jsdom";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import type { Adapter } from "../types";
import type { ExtractedDocument } from "../../extract/document";
import { collectMediaFromDocument } from "../../media/markdown-media";

const HN_BASE_URL = "https://news.ycombinator.com";

// Shared HTML -> Markdown converter used for story text and comment bodies.
const turndown = new TurndownService({
  headingStyle: "atx",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
});

turndown.use(gfm);

/** A single Hacker News item (story, comment, ...) as consumed by this adapter. */
export interface HnItem {
  id: number;
  type: "story" | "comment" | "job" | "poll" | "pollopt" | string;
  by?: string;
  /** Unix timestamp in seconds (it is multiplied by 1000 before building a `Date`). */
  time?: number;
  /** HTML fragment; converted to Markdown when rendered. */
  text?: string;
  /** HTML fragment; decoded to plain text for the document title. */
  title?: string;
  url?: string;
  score?: number;
  descendants?: number;
  kids?: number[];
  parent?: number;
  deleted?: boolean;
  dead?: boolean;
}

/** A comment together with its nested replies. */
export interface HnCommentNode {
  item: HnItem;
  children: HnCommentNode[];
}

interface ParsedHnThread {
  story: HnItem;
  comments: HnCommentNode[];
}

/**
 * Resolves `href` against `baseUrl`.
 *
 * @returns The absolute URL string, or `undefined` when `href` is empty or
 *   the pair cannot be parsed as a URL (the `URL` constructor throws).
 */
function resolveUrl(href: string | null | undefined, baseUrl: string): string | undefined {
  if (!href) {
    return undefined;
  }

  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return undefined;
  }
}

/**
 * Parses an element `id` attribute that must consist solely of decimal digits.
 *
 * `Number(null)` and `Number("")` are both `0`, so a plain `Number()` cast
 * would silently turn a missing attribute into item id 0; this rejects it.
 */
function parseNumericId(raw: string | null): number | undefined {
  if (!raw || !/^\d+$/.test(raw)) {
    return undefined;
  }

  return Number(raw);
}

/**
 * Decodes an HTML fragment to its trimmed text content.
 *
 * @returns `undefined` for empty input or when the decoded text is empty.
 */
function decodeHtmlText(value: string | undefined): string | undefined {
  if (!value) {
    return undefined;
  }

  const dom = new JSDOM(`<!doctype html><html><body>${value}</body></html>`);
  return dom.window.document.body.textContent?.trim() || undefined;
}

/**
 * Normalizes line endings to `\n`, strips trailing spaces/tabs on each line,
 * collapses runs of three or more newlines to two, and trims the result.
 */
function normalizeMarkdown(markdown: string): string {
  return markdown
    .replace(/\r\n/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Converts an HTML fragment to normalized Markdown, rewriting every
 * `a[href]` to an absolute URL resolved against `baseUrl`.
 *
 * @returns An empty string when `html` is missing or whitespace-only.
 */
function convertHnHtmlToMarkdown(html: string | undefined, baseUrl: string): string {
  if (!html?.trim()) {
    return "";
  }

  const dom = new JSDOM(`<div id="__root">${html}</div>`, { url: baseUrl });
  const root = dom.window.document.querySelector("#__root");
  if (!root) {
    return "";
  }

  root.querySelectorAll("a[href]").forEach((element) => {
    const href = element.getAttribute("href");
    if (!href) {
      return;
    }

    try {
      element.setAttribute("href", new URL(href, baseUrl).toString());
    } catch {
      // Ignore malformed URLs and keep the original href.
    }
  });

  return normalizeMarkdown(turndown.turndown(root.innerHTML));
}

/**
 * Formats Unix seconds as an ISO-8601 string.
 *
 * @returns `undefined` when the input is missing, zero, non-finite, or falls
 *   outside the range a `Date` can represent (where `toISOString()` would
 *   otherwise throw a `RangeError`).
 */
function formatIsoTimestamp(unixSeconds: number | undefined): string | undefined {
  if (!unixSeconds || !Number.isFinite(unixSeconds)) {
    return undefined;
  }

  const date = new Date(unixSeconds * 1_000);
  if (Number.isNaN(date.getTime())) {
    return undefined;
  }

  return date.toISOString();
}

/**
 * Formats Unix seconds for display by rewriting the ISO string's `T`
 * separator and `.000Z` suffix; returns `"unknown time"` when no ISO
 * timestamp can be produced.
 */
function formatDisplayTimestamp(unixSeconds: number | undefined): string {
  const iso = formatIsoTimestamp(unixSeconds);
  if (!iso) {
    return "unknown time";
  }

  return iso.replace("T", " ").replace(".000Z", " UTC");
}

/**
 * Prefixes every line of `markdown` with `spaces` spaces. Empty lines receive
 * the bare prefix; callers run `normalizeMarkdown` afterwards, which strips
 * that trailing whitespace again.
 */
function indentMarkdown(markdown: string, spaces: number): string {
  const prefix = " ".repeat(spaces);
  return markdown
    .split("\n")
    .map((line) => (line ? `${prefix}${line}` : prefix))
    .join("\n");
}

/**
 * Builds the `author · time` header of a comment. When the item has a truthy
 * id, the time is rendered as a link to `pageUrl#<id>`.
 */
function renderCommentHeader(item: HnItem, pageUrl: string): string {
  const author = item.by ?? "[deleted]";
  const time = item.id
    ? `[${formatDisplayTimestamp(item.time)}](${pageUrl}#${item.id})`
    : formatDisplayTimestamp(item.time);
  return `${author} · ${time}`;
}

/**
 * Renders a comment and, recursively, its replies as a nested Markdown list.
 *
 * @param node - Comment to render.
 * @param pageUrl - Thread URL used for permalinks and relative link resolution.
 * @param depth - Nesting level; each level indents by four spaces.
 */
function renderCommentNode(node: HnCommentNode, pageUrl: string, depth = 0): string {
  const baseIndent = " ".repeat(depth * 4);
  // Body content sits one level deeper than the bullet so it stays inside the list item.
  const bodyIndent = depth * 4 + 4;
  const lines = [`${baseIndent}- ${renderCommentHeader(node.item, pageUrl)}`];
  const body = convertHnHtmlToMarkdown(node.item.text, pageUrl);

  if (body) {
    lines.push("");
    lines.push(indentMarkdown(body, bodyIndent));
  } else if (node.item.deleted || node.item.dead) {
    lines.push("");
    // Use the same indent as a regular body; a shallower indent after the
    // blank line would place the placeholder outside the list item.
    lines.push(`${" ".repeat(bodyIndent)}[comment unavailable]`);
  }

  for (const child of node.children) {
    lines.push("");
    lines.push(renderCommentNode(child, pageUrl, depth + 1));
  }

  return lines.join("\n");
}

/**
 * Renders a story and its comment tree as a Markdown document: source link
 * (when it differs from the thread URL), item link, submission line, stats,
 * optional "Post" section, and a "Comments" section.
 *
 * A `story.url` that cannot be parsed is treated as absent instead of
 * aborting the render.
 *
 * @param story - The thread's root item.
 * @param comments - Top-level comments with nested replies.
 * @param pageUrl - URL of the thread page; base for relative links.
 * @returns Normalized Markdown.
 */
export function buildHnThreadMarkdown(
  story: HnItem,
  comments: HnCommentNode[],
  pageUrl: string,
): string {
  const lines: string[] = [];
  const storyUrl = resolveUrl(story.url, pageUrl);
  const storyText = convertHnHtmlToMarkdown(story.text, pageUrl);

  if (storyUrl && storyUrl !== pageUrl) {
    lines.push(`Source: [${storyUrl}](${storyUrl})`);
  }
  lines.push(`HN Item: [${story.id}](${pageUrl})`);

  const submittedBy = story.by ? ` by ${story.by}` : "";
  const submittedAt = formatDisplayTimestamp(story.time);
  lines.push(`Submitted${submittedBy} at ${submittedAt}`);

  const stats: string[] = [];
  if (typeof story.score === "number") {
    stats.push(`${story.score} points`);
  }
  if (typeof story.descendants === "number") {
    stats.push(`${story.descendants} comments`);
  }
  if (stats.length > 0) {
    lines.push(stats.join(" | "));
  }

  if (storyText) {
    lines.push("");
    lines.push("## Post");
    lines.push("");
    lines.push(storyText);
  }

  lines.push("");
  lines.push("## Comments");
  lines.push("");

  if (comments.length === 0) {
    lines.push("No comments.");
  } else {
    lines.push(comments.map((comment) => renderCommentNode(comment, pageUrl)).join("\n\n"));
  }

  return normalizeMarkdown(lines.join("\n"));
}

/**
 * Wraps a parsed thread in an `ExtractedDocument` whose single content entry
 * is the Markdown produced by `buildHnThreadMarkdown`.
 *
 * The title falls back to `HN Item <id>` when the story has no decodable
 * title; an unparseable `story.url` yields `storyUrl: undefined`.
 */
export function buildHnDocument(
  story: HnItem,
  comments: HnCommentNode[],
  pageUrl: string,
): ExtractedDocument {
  const decodedTitle = decodeHtmlText(story.title) ?? `HN Item ${story.id}`;

  return {
    url: pageUrl,
    canonicalUrl: pageUrl,
    title: decodedTitle,
    author: story.by,
    siteName: "Hacker News",
    publishedAt: formatIsoTimestamp(story.time),
    adapter: "hn",
    metadata: {
      kind: "hn/story",
      storyId: story.id,
      storyUrl: resolveUrl(story.url, pageUrl),
      points: story.score,
      commentCount: story.descendants,
    },
    content: [
      {
        type: "markdown",
        markdown: buildHnThreadMarkdown(story, comments, pageUrl),
      },
    ],
  };
}

/**
 * Extracts the numeric item id from a `news.ycombinator.com/item?id=<digits>` URL.
 *
 * @returns The id, or `null` when the host, path, or `id` parameter does not match.
 */
export function parseHnItemId(url: URL): number | null {
  if (url.hostname !== "news.ycombinator.com") {
    return null;
  }

  if (url.pathname !== "/item") {
    return null;
  }

  const value = url.searchParams.get("id");
  if (!value || !/^\d+$/.test(value)) {
    return null;
  }

  return Number(value);
}

/**
 * Reads the element's `title` attribute and returns its trailing run of nine
 * or more digits as a number; callers store it as Unix seconds.
 */
function extractUnixSecondsFromAge(element: Element | null): number | undefined {
  const title = element?.getAttribute("title")?.trim();
  if (!title) {
    return undefined;
  }

  const match = title.match(/(\d{9,})$/);
  return match ? Number(match[1]) : undefined;
}

/** Returns the first run of digits in `text` as a number, if any. */
function extractScore(text: string | null | undefined): number | undefined {
  if (!text) {
    return undefined;
  }

  const match = text.match(/(\d+)/);
  return match ? Number(match[1]) : undefined;
}

/**
 * Scans the anchors inside `container` for text of the form `<n> comment(s)`
 * and returns `n` from the first match.
 */
function extractCommentCount(container: ParentNode): number | undefined {
  for (const anchor of Array.from(container.querySelectorAll("a"))) {
    const match = anchor.textContent?.trim().match(/(\d+)\s+comments?/i);
    if (match) {
      return Number(match[1]);
    }
  }
  return undefined;
}

/**
 * Resolves the title link of a story to an absolute URL.
 *
 * @returns `undefined` when the link is missing, unparseable, or merely points
 *   back at the thread itself (the page URL or the canonical item URL).
 */
function normalizeStoryUrl(
  storyId: number,
  href: string | null | undefined,
  pageUrl: string,
): string | undefined {
  const resolved = resolveUrl(href, pageUrl);
  if (!resolved) {
    return undefined;
  }

  if (resolved === pageUrl || resolved === `${HN_BASE_URL}/item?id=${storyId}`) {
    return undefined;
  }
  return resolved;
}

/**
 * Parses the rendered HTML of an item page into a story plus comment tree.
 *
 * The story comes from `table.fatitem`; comments come from `tr.athing.comtr`
 * rows in document order, nested using each row's `td.ind[indent]` value.
 *
 * @param html - Full page HTML.
 * @param pageUrl - URL the HTML was loaded from.
 * @returns The parsed thread, or `null` when no submission row with an
 *   all-digit `id` attribute is found.
 */
export function extractHnThreadFromHtml(html: string, pageUrl: string): ParsedHnThread | null {
  const dom = new JSDOM(html, { url: pageUrl });
  const { document } = dom.window;
  const storyRow = document.querySelector("table.fatitem tr.athing.submission");
  if (!storyRow) {
    return null;
  }

  const storyId = parseNumericId(storyRow.getAttribute("id"));
  if (storyId === undefined) {
    return null;
  }

  const titleLink = storyRow.querySelector(".titleline > a");
  const subline = document.querySelector("table.fatitem .subline");
  const topText = document.querySelector("table.fatitem .toptext");

  const story: HnItem = {
    id: storyId,
    type: "story",
    by: subline?.querySelector(".hnuser")?.textContent?.trim() || undefined,
    time: extractUnixSecondsFromAge(subline?.querySelector(".age") ?? null),
    title: titleLink?.innerHTML?.trim() || undefined,
    url: normalizeStoryUrl(storyId, titleLink?.getAttribute("href"), pageUrl),
    text: topText?.innerHTML?.trim() || undefined,
    score: extractScore(subline?.querySelector(".score")?.textContent),
    descendants: extractCommentCount(subline ?? document),
  };

  const roots: HnCommentNode[] = [];
  // Chain of ancestors of the row being processed; stack[i] is the most
  // recent comment seen at depth i.
  const stack: HnCommentNode[] = [];

  document.querySelectorAll("tr.athing.comtr").forEach((row) => {
    const commentId = parseNumericId(row.getAttribute("id"));
    if (commentId === undefined) {
      return;
    }

    const indentRaw = row.querySelector("td.ind")?.getAttribute("indent");
    const depth = indentRaw && /^\d+$/.test(indentRaw) ? Number(indentRaw) : 0;
    const comhead = row.querySelector(".comhead");
    const commentText = row.querySelector(".comment > .commtext");
    const item: HnItem = {
      id: commentId,
      type: "comment",
      by: comhead?.querySelector(".hnuser")?.textContent?.trim() || undefined,
      time: extractUnixSecondsFromAge(comhead?.querySelector(".age") ?? null),
      text: commentText?.innerHTML?.trim() || undefined,
      // A row without a text container is flagged as deleted.
      deleted: commentText === null,
    };

    const node: HnCommentNode = {
      item,
      children: [],
    };

    // Drop everything at this depth or deeper so the parent ends up on top.
    while (stack.length > depth) {
      stack.pop();
    }

    const parent = stack[stack.length - 1];
    if (parent) {
      parent.children.push(node);
    } else {
      roots.push(node);
    }

    stack.push(node);
  });

  return {
    story,
    comments: roots,
  };
}

/**
 * Adapter for Hacker News item pages.
 *
 * `match` accepts URLs for which `parseHnItemId` yields an id. `process`
 * navigates the browser to the page, reads its HTML, and returns the built
 * document with its media; it reports `no_document` when the URL is not an
 * item URL or the page contains no parsable submission.
 */
export const hnAdapter: Adapter = {
  name: "hn",
  match(input) {
    return parseHnItemId(input.url) !== null;
  },
  async process(context) {
    // Same null check as `match`, so both agree on which URLs are handled.
    if (parseHnItemId(context.input.url) === null) {
      return {
        status: "no_document",
      };
    }

    const pageUrl = context.input.url.toString();
    context.log.info(`Loading ${pageUrl} with hn adapter`);
    await context.browser.goto(pageUrl, context.timeoutMs);
    const html = await context.browser.getHTML();
    const thread = extractHnThreadFromHtml(html, pageUrl);
    if (!thread) {
      return {
        status: "no_document",
      };
    }

    const document = buildHnDocument(thread.story, thread.comments, pageUrl);
    return {
      status: "ok",
      document,
      media: collectMediaFromDocument(document),
    };
  },
};