Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/adapters/x/article.ts
import type { ExtractedDocument } from "../../extract/document";
import {
  findTweetNode,
  findTweetNodeById,
  formatMediaList,
  formatTweetAuthor,
  getTweetAuthorMetadata,
  getTweetText,
  getUser,
  isRecord,
  normalizeTitle,
  resolveBestXVideoVariantUrl,
  toHighResXImageUrl,
  toXTweet,
} from "./shared";
import type { JsonObject } from "./types";

/** A media item referenced by an article, reduced to what the markdown renderer needs. */
interface ArticleMedia {
  kind: "image" | "video";
  url: string;
}

/** Prefix this module looks for on `formatMediaList` lines to recognise photo entries. */
const PHOTO_LINE_PREFIX = "photo: ";

/**
 * Resolves a `media_info` object to a single renderable media item.
 *
 * A video variant URL (via `resolveBestXVideoVariantUrl`) takes precedence; otherwise the
 * first non-empty string among `original_img_url` and `url` is used as an image URL and
 * passed through `toHighResXImageUrl`.
 *
 * @returns The resolved media, or `null` when neither a video nor an image URL is present.
 */
function resolveArticleMedia(mediaInfo: JsonObject): ArticleMedia | null {
  const videoUrl = resolveBestXVideoVariantUrl(mediaInfo);
  if (videoUrl) {
    return {
      kind: "video",
      url: videoUrl,
    };
  }

  const rawUrl =
    (typeof mediaInfo.original_img_url === "string" && mediaInfo.original_img_url) ||
    (typeof mediaInfo.url === "string" && mediaInfo.url) ||
    "";

  if (!rawUrl) {
    return null;
  }

  return {
    kind: "image",
    url: toHighResXImageUrl(rawUrl),
  };
}

/** Like {@link resolveArticleMedia}, but returns only the URL (empty string when unresolved). */
function resolveArticleMediaUrl(mediaInfo: JsonObject): string {
  return resolveArticleMedia(mediaInfo)?.url ?? "";
}

/**
 * Coerces an entity key to the string form used as the entity-map key.
 *
 * @returns The key as a string, or `undefined` when it is neither a string nor a number.
 *   Callers additionally treat the empty string as "no key".
 */
function toEntityKey(entityKey: unknown): string | undefined {
  if (typeof entityKey === "string" || typeof entityKey === "number") {
    return String(entityKey);
  }
  return undefined;
}

/**
 * Normalizes the article's `entityMap` into a `Map` keyed by stringified entity key.
 *
 * Two input shapes are accepted:
 * - an array of `{ key, value }` records, and
 * - a plain object mapping key to entity record.
 *
 * Entries whose key is missing/empty or whose value is not a record are skipped.
 */
function normalizeEntityMap(entityMap: unknown): Map<string, JsonObject> {
  const normalized = new Map<string, JsonObject>();

  if (Array.isArray(entityMap)) {
    for (const entry of entityMap) {
      if (!isRecord(entry)) {
        continue;
      }

      const key = toEntityKey(entry.key);
      const value = isRecord(entry.value) ? entry.value : undefined;
      if (!key || !value) {
        continue;
      }
      normalized.set(key, value);
    }

    return normalized;
  }

  if (!isRecord(entityMap)) {
    return normalized;
  }

  for (const [key, value] of Object.entries(entityMap)) {
    if (!isRecord(value)) {
      continue;
    }
    normalized.set(key, value);
  }

  return normalized;
}

/**
 * Looks up the entity for `entityKey` and returns its `data` record if the entity has the
 * expected `type`.
 *
 * @returns The entity's `data` (an empty object when `data` is not a record), or `null`
 *   when the key is unusable, the entity is missing, or its type does not match.
 */
function getEntityData(
  entityMap: Map<string, JsonObject>,
  entityKey: unknown,
  type: string,
): JsonObject | null {
  const key = toEntityKey(entityKey);
  if (!key) {
    return null;
  }

  const entity = entityMap.get(key);
  if (!entity || entity.type !== type) {
    return null;
  }

  return isRecord(entity.data) ? entity.data : {};
}

/** Returns the trimmed `data.markdown` of a `MARKDOWN` entity, or `null` if absent/blank. */
function getEntityMarkdown(entityMap: Map<string, JsonObject>, entityKey: unknown): string | null {
  const data = getEntityData(entityMap, entityKey, "MARKDOWN");
  if (!data || typeof data.markdown !== "string") {
    return null;
  }

  const markdown = data.markdown.trim();
  return markdown || null;
}

/**
 * Returns the target URL of a `LINK` entity.
 *
 * Candidate fields are tried in order of preference (expanded, original, plain, display;
 * each in snake_case and camelCase); the first non-blank string wins.
 */
function getLinkUrl(entityMap: Map<string, JsonObject>, entityKey: unknown): string | null {
  const data = getEntityData(entityMap, entityKey, "LINK");
  if (!data) {
    return null;
  }

  const candidates = [
    data.expanded_url,
    data.expandedUrl,
    data.original_url,
    data.originalUrl,
    data.url,
    data.display_url,
    data.displayUrl,
  ];

  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate.trim()) {
      return candidate.trim();
    }
  }

  return null;
}

/** Returns `data.tweetId` of a `TWEET` entity, or `null` when it is missing or not a string. */
function getTweetId(entityMap: Map<string, JsonObject>, entityKey: unknown): string | null {
  const data = getEntityData(entityMap, entityKey, "TWEET");
  if (!data || typeof data.tweetId !== "string") {
    return null;
  }

  return data.tweetId;
}

/**
 * Builds a lookup from `media_id` to resolved media for everything listed in
 * `articleResult.media_entities`, plus `articleResult.cover_media` when present.
 * Entries without a string `media_id`, a record `media_info`, or a resolvable URL are skipped.
 */
function buildMediaMap(articleResult: JsonObject): Map<string, ArticleMedia> {
  const mediaMap = new Map<string, ArticleMedia>();
  const mediaEntities = Array.isArray(articleResult.media_entities)
    ? articleResult.media_entities
    : [];

  for (const entity of mediaEntities) {
    if (!isRecord(entity) || typeof entity.media_id !== "string" || !isRecord(entity.media_info)) {
      continue;
    }

    const media = resolveArticleMedia(entity.media_info);
    if (media) {
      mediaMap.set(entity.media_id, media);
    }
  }

  const coverMedia = isRecord(articleResult.cover_media) ? articleResult.cover_media : null;
  if (coverMedia && typeof coverMedia.media_id === "string" && isRecord(coverMedia.media_info)) {
    const media = resolveArticleMedia(coverMedia.media_info);
    if (media) {
      mediaMap.set(coverMedia.media_id, media);
    }
  }

  return mediaMap;
}

/**
 * Renders the media items of a `MEDIA` entity as markdown snippets.
 *
 * Each `data.mediaItems[].mediaId` is resolved through `mediaMap`; items resolving to an
 * already-seen URL are dropped. Images become `![](url)` and videos become `[video](url)`.
 *
 * @returns One markdown snippet per unique media item; empty when the entity is not a
 *   `MEDIA` entity or nothing resolves.
 */
function getMediaMarkdown(
  entityMap: Map<string, JsonObject>,
  entityKey: unknown,
  mediaMap: Map<string, ArticleMedia>,
): string[] {
  const data = getEntityData(entityMap, entityKey, "MEDIA");
  if (!data) {
    return [];
  }

  const mediaItems = Array.isArray(data.mediaItems) ? data.mediaItems : [];
  const media: ArticleMedia[] = [];

  for (const item of mediaItems) {
    if (!isRecord(item) || typeof item.mediaId !== "string") {
      continue;
    }
    const mediaItem = mediaMap.get(item.mediaId);
    if (mediaItem && !media.some((value) => value.url === mediaItem.url)) {
      media.push(mediaItem);
    }
  }

  // Images must be emitted as markdown image syntax; an empty string here would silently
  // drop every inline image from the rendered article.
  return media.map((item) =>
    item.kind === "image" ? `![](${item.url})` : `[video](${item.url})`,
  );
}

/**
 * Renders an embedded tweet as a markdown blockquote.
 *
 * The first payload containing the tweet (via `findTweetNodeById`) is used. The quote
 * contains the author line (falling back to the tweet URL), the tweet text, any media
 * (photos as images, everything else as list items), and finally the tweet URL.
 *
 * @returns The blockquote markdown. When no payload contains the tweet, a one-line
 *   placeholder quote linking to the status is returned instead.
 */
function resolveTweetMarkdown(payloads: unknown[], tweetId: string, pageUrl: string): string | null {
  for (const payload of payloads) {
    const tweet = findTweetNodeById(payload, tweetId);
    if (!tweet) {
      continue;
    }

    const xTweet = toXTweet(tweet, pageUrl);
    const author = formatTweetAuthor(xTweet) ?? xTweet.url;
    const lines = [`> ${author}`, ...xTweet.text.split("\n").map((line) => `> ${line}`)];

    // Photo lines carry their URL after the "photo: " prefix; render those as images and
    // keep every other media line as a quoted list item.
    const media = formatMediaList(xTweet.media).map((line) =>
      line.startsWith(PHOTO_LINE_PREFIX)
        ? `> ![](${line.slice(PHOTO_LINE_PREFIX.length)})`
        : `> - ${line}`,
    );

    const parts = [lines.join("\n")];
    if (media.length > 0) {
      // A bare ">" keeps the media visually separated while staying inside the quote.
      parts.push([">", ...media].join("\n"));
    }
    parts.push(`> ${xTweet.url}`);

    return parts.join("\n").trim();
  }

  return `> Embedded tweet: https://x.com/i/status/${tweetId}`;
}

/**
 * Replaces every `LINK` entity range in a block's text with the link's URL.
 *
 * Ranges are applied from the highest offset to the lowest so that earlier offsets stay
 * valid while the string is being rewritten. Ranges with a non-numeric/negative offset, a
 * non-positive length, or no resolvable URL are ignored.
 */
function replaceLinkEntities(text: string, block: JsonObject, entityMap: Map<string, JsonObject>): string {
  const entityRanges = Array.isArray(block.entityRanges) ? block.entityRanges : [];
  const replacements = entityRanges
    .filter((range): range is JsonObject => isRecord(range))
    .map((range) => {
      const offset = typeof range.offset === "number" ? range.offset : -1;
      const length = typeof range.length === "number" ? range.length : -1;
      const url = getLinkUrl(entityMap, range.key);
      return { offset, length, url };
    })
    .filter((range) => range.offset >= 0 && range.length > 0 && range.url)
    .sort((left, right) => right.offset - left.offset);

  let next = text;
  for (const replacement of replacements) {
    next =
      next.slice(0, replacement.offset) +
      replacement.url +
      next.slice(replacement.offset + replacement.length);
  }
  return next;
}

/**
 * Renders an `atomic` block by rendering each of its entity ranges.
 *
 * For every range the entity is tried, in order, as embedded markdown, as media, and as an
 * embedded tweet; the first interpretation that yields content is used.
 *
 * @returns The rendered parts joined by blank lines, or `null` when nothing rendered.
 */
function renderAtomicBlock(
  block: JsonObject,
  entityMap: Map<string, JsonObject>,
  mediaMap: Map<string, ArticleMedia>,
  payloads: unknown[],
  pageUrl: string,
): string | null {
  const entityRanges = Array.isArray(block.entityRanges) ? block.entityRanges : [];
  const parts: string[] = [];

  for (const range of entityRanges) {
    if (!isRecord(range)) {
      continue;
    }

    const markdown = getEntityMarkdown(entityMap, range.key);
    if (markdown) {
      parts.push(markdown);
      continue;
    }

    const mediaMarkdown = getMediaMarkdown(entityMap, range.key, mediaMap);
    if (mediaMarkdown.length > 0) {
      parts.push(mediaMarkdown.join("\n\n"));
      continue;
    }

    const tweetId = getTweetId(entityMap, range.key);
    if (tweetId) {
      const tweetMarkdown = resolveTweetMarkdown(payloads, tweetId, pageUrl);
      if (tweetMarkdown) {
        parts.push(tweetMarkdown);
      }
    }
  }

  if (parts.length === 0) {
    return null;
  }

  return parts.join("\n\n");
}

/**
 * Renders the article's content blocks to markdown, one paragraph per block.
 *
 * Non-record blocks and non-atomic blocks with empty text are skipped. A block without a
 * string `type` is treated as `"unstyled"`, and unknown types are emitted as plain text.
 * Ordered-list numbering restarts whenever a non-skipped block of another type intervenes.
 */
function renderArticleBlocks(
  blocks: unknown[],
  entityMap: Map<string, JsonObject>,
  mediaMap: Map<string, ArticleMedia>,
  payloads: unknown[],
  pageUrl: string,
): string {
  const parts: string[] = [];
  let orderedCounter = 0;

  for (const block of blocks) {
    if (!isRecord(block)) {
      continue;
    }

    const blockType = typeof block.type === "string" ? block.type : "unstyled";
    const rawText = typeof block.text === "string" ? block.text : "";
    const text = replaceLinkEntities(rawText, block, entityMap).trim();
    // Atomic blocks carry their content in entities, so empty text is expected for them.
    if (!text && blockType !== "atomic") {
      continue;
    }

    if (blockType !== "ordered-list-item") {
      orderedCounter = 0;
    }

    switch (blockType) {
      case "header-one":
        parts.push(`# ${text}`);
        break;
      case "header-two":
        parts.push(`## ${text}`);
        break;
      case "header-three":
        parts.push(`### ${text}`);
        break;
      case "blockquote":
        parts.push(`> ${text}`);
        break;
      case "unordered-list-item":
        parts.push(`- ${text}`);
        break;
      case "ordered-list-item":
        orderedCounter += 1;
        parts.push(`${orderedCounter}. ${text}`);
        break;
      case "code-block":
        parts.push(`\`\`\`\n${text}\n\`\`\``);
        break;
      case "atomic": {
        const markdown = renderAtomicBlock(block, entityMap, mediaMap, payloads, pageUrl);
        if (markdown) {
          parts.push(markdown);
        }
        break;
      }
      default:
        parts.push(text);
        break;
    }
  }

  return parts.join("\n\n").trim();
}

/** Returns `tweet.article.article_results.result` when every level is a record, else `null`. */
function getArticleResult(tweet: JsonObject): JsonObject | null {
  if (
    isRecord(tweet.article) &&
    isRecord(tweet.article.article_results) &&
    isRecord(tweet.article.article_results.result)
  ) {
    return tweet.article.article_results.result as JsonObject;
  }
  return null;
}

/**
 * Picks a short summary from rendered markdown: the first paragraph that is not a heading,
 * blockquote, list item, or code fence, truncated to 220 characters.
 *
 * @returns The summary, or `undefined` when no such paragraph exists.
 */
function extractSummary(markdown: string): string | undefined {
  const segments = markdown
    .split(/\n\n+/)
    .map((segment) => segment.trim())
    .filter(Boolean);

  const preferred = segments.find((segment) => !/^(#|>|- |\d+\. |```)/.test(segment));
  return preferred?.slice(0, 220);
}

/**
 * Extracts an X article attached to a tweet as an {@link ExtractedDocument}.
 *
 * The body is chosen, in order of preference, from the rendered content blocks, the
 * article's `plain_text`, and the tweet's own text.
 *
 * @param payload - Payload searched (via `findTweetNode`) for the tweet with `statusId`.
 * @param statusId - Id of the tweet that carries the article.
 * @param pageUrl - URL of the page being extracted; reported as the document `url`.
 * @param payloads - All payloads searched when resolving embedded tweets; defaults to
 *   just `payload`.
 * @returns The extracted document, or `null` when the tweet is not found, has no article
 *   result, or no content could be produced.
 */
export function extractArticleDocumentFromPayload(
  payload: unknown,
  statusId: string,
  pageUrl: string,
  payloads: unknown[] = [payload],
): ExtractedDocument | null {
  const tweet = findTweetNode(payload, statusId);
  if (!tweet) {
    return null;
  }

  const articleResult = getArticleResult(tweet);
  if (!articleResult) {
    return null;
  }

  const title = typeof articleResult.title === "string" ? articleResult.title.trim() : undefined;
  const contentState = isRecord(articleResult.content_state) ? articleResult.content_state : {};
  const blocks = Array.isArray(contentState.blocks) ? contentState.blocks : [];
  const entityMap = normalizeEntityMap(contentState.entityMap);
  const mediaMap = buildMediaMap(articleResult);
  const richMarkdown = renderArticleBlocks(blocks, entityMap, mediaMap, payloads, pageUrl);
  const plainText = typeof articleResult.plain_text === "string" ? articleResult.plain_text.trim() : "";
  const markdown = richMarkdown || plainText || getTweetText(tweet);
  if (!markdown) {
    return null;
  }

  const xTweet = toXTweet(tweet, pageUrl);
  const user = getUser(tweet);
  const coverMedia = isRecord(articleResult.cover_media) ? articleResult.cover_media : null;
  const coverMediaInfo = coverMedia && isRecord(coverMedia.media_info) ? coverMedia.media_info : null;
  const coverImage = coverMediaInfo ? resolveArticleMediaUrl(coverMediaInfo) || undefined : undefined;
  const authorUsername = xTweet.author ?? user.screenName;

  return {
    url: pageUrl,
    canonicalUrl: xTweet.url,
    title: title || normalizeTitle(xTweet.text, "X Article"),
    author: formatTweetAuthor(xTweet),
    siteName: "X",
    publishedAt: xTweet.createdAt,
    summary: extractSummary(markdown) || xTweet.text.slice(0, 200) || undefined,
    adapter: "x",
    metadata: {
      kind: "x/article",
      tweetId: xTweet.id,
      coverImage,
      authorName: xTweet.authorName ?? user.name,
      authorUsername,
      authorUrl: authorUsername ? `https://x.com/${authorUsername}` : undefined,
      // Spread last so the shared author metadata wins over the fields above.
      ...getTweetAuthorMetadata(xTweet),
    },
    content: [{ type: "markdown", markdown }],
  };
}