/**
 * thread-keeper
 * @module utils.TwitterCapture
 * @author The Harvard Library Innovation Lab
 * @license MIT
 */
import fs from "fs";
import { spawnSync } from "child_process";

import { chromium, devices } from "playwright";
import { v4 as uuidv4 } from "uuid";
import { PDFDocument } from "pdf-lib";
import { globSync } from "glob";

import { CERTS_PATH, TMP_PATH, EXECUTABLES_FOLDER, TEMPLATES_PATH, APP_VERSION } from "../const.js";

/**
 * Generates a "sealed" PDF out of a twitter.com url using Playwright.
 *
 * Usage:
 * ```
 * const tweet = new TwitterCapture(url);
 * const pdf = await tweet.capture();
 * fs.writeFileSync("tweet.pdf", pdf);
 * ```
 */
export class TwitterCapture {
  /**
   * Defaults for options that can be passed to `TwitterCapture`.
   * @property {string} appVersion
   * @property {string} privateKeyPath - Path to `.pem` file containing a private key.
   * @property {string} certPath - Path to a `.pem` file containing a certificate.
   * @property {string} tmpFolderPath - Path to a folder in which temporary files can be written.
   * @property {string} templatesFolderPath - Path to the templates folder (t.co resolver summary feature).
   * @property {string} ytDlpPath - Path to the `yt-dlp` executable.
   * @property {string} timestampServerUrl - Timestamping server.
   * @property {number} networkidleTimeout - Time to wait for "networkidle" state (ms).
   * @property {boolean} runBrowserBehaviors - If `true`, will try to auto-scroll and open more responses. Set to `false` automatically when trying to capture a profile url.
   * @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time (ms).
   * @property {number} videoCaptureTimeout - Maximum yt-dlp execution time (ms).
   * @property {number} renderTimeout - Time to wait for re-renders (ms).
   */
  static defaults = {
    appVersion: APP_VERSION,
    privateKeyPath: `${CERTS_PATH}key.pem`,
    certPath: `${CERTS_PATH}cert.pem`,
    tmpFolderPath: `${TMP_PATH}`,
    templatesFolderPath: `${TEMPLATES_PATH}pdf-attachments/`,
    ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
    timestampServerUrl: "http://timestamp.digicert.com",
    networkidleTimeout: 5000,
    runBrowserBehaviors: true,
    browserBehaviorsTimeout: 33500,
    videoCaptureTimeout: 10000,
    renderTimeout: 4000,
  };

  /**
   * Effective options for this capture. Based on `TwitterCapture.defaults`.
   * @type {object}
   */
  options = {};

  /** @type {?string} */
  url = null;

  /**
   * One of "status", "search" or "profile". Set by `filterUrl()`.
   * @type {?string}
   */
  urlType = null;

  /**
   * @type {{
   *   browser: playwright.Browser,
   *   context: playwright.BrowserContext,
   *   page: playwright.Page,
   *   viewport: ?{width: number, height: number},
   *   ready: boolean
   * }}
   */
  playwright = {
    browser: null,
    context: null,
    page: null,
    viewport: null,
    ready: false
  };

  /**
   * JPEG response bodies intercepted during capture, keyed by url.
   * @type {object}
   */
  interceptedJPEGs = {};

  /**
   * @param {string} url - `twitter.com` url to capture. Works best on statuses and threads.
   * @param {object} options - See `TwitterCapture.defaults` for detailed options. Will use defaults unless overridden.
   * @throws {Error} If `url` is not a valid `https://twitter.com` url.
   */
  constructor(url, options = {}) {
    this.filterUrl(url);
    this.filterOptions(options);

    // Options adjustments: browser behaviors are never run on profile pages.
    if (this.urlType === "profile") {
      this.options.runBrowserBehaviors = false;
    }
  }

  /**
   * Captures the current Twitter.com url and makes it a signed PDF.
   * The browser is always torn down, whether the capture succeeds or fails.
   *
   * @returns {Promise<Buffer>} - Signed PDF.
   */
  capture = async() => {
    try {
      // Playwright init
      await this.setup();

      // Page load + network idle
      try {
        await this.playwright.page.goto(this.url, {
          waitUntil: "networkidle",
          timeout: this.options.networkidleTimeout,
        });
      }
      catch(err) { /* Timeout errors are non-blocking */ }

      // Adjust UI (#1)
      await this.adjustUIForCapture();

      // Run browser behaviors, or simply give the page time to settle.
      if (this.options.runBrowserBehaviors === true) {
        await this.runBrowserBehaviors();
      }
      else {
        await new Promise((resolve) =>
          setTimeout(resolve, this.options.networkidleTimeout + this.options.renderTimeout)
        );
      }

      // Wait for network idle
      try {
        await this.playwright.page.waitForLoadState("networkidle", {
          timeout: this.options.networkidleTimeout
        });
      }
      catch(err) { /* Timeout errors are non-blocking */ }

      // Adjust UI (#2 - Accounts for re-renders)
      await this.adjustUIForCapture();

      // Resize browser to fit document dimensions (skipped on profile pages)
      if (this.urlType !== "profile") {
        await this.resizeViewportToFitDocument();
      }

      // Generate raw PDF and open editable PDF
      const rawPDF = await this.generateRawPDF();
      const editablePDF = await PDFDocument.load(rawPDF);

      // Add metadata, remove extraneous page
      try {
        const now = new Date();
        editablePDF.setTitle(`Capture of ${this.url} by thread-keeper on ${now.toISOString()}`);
        editablePDF.setCreationDate(now);
        editablePDF.setModificationDate(now);
        editablePDF.setProducer(`thread-keeper ${this.options.appVersion}`);
        editablePDF.removePage(1); // This step may throw if there's only 1 page.
      }
      catch { /* Best-effort: a single-page document is expected to land here. */ }

      // Try to crop remaining white space
      await this.cropMarginsOnPDF(editablePDF);

      // Add intercepted JPEGs as attachments
      await this.addInterceptedJPEGsToPDF(editablePDF);

      // Try to capture t.co to full urls map, and add it as attachment
      await this.captureAndAddUrlMapToPDF(editablePDF);

      // Try to capture video, if any, and add it as attachment
      await this.captureAndAddVideoToPDF(editablePDF);

      // Freeze edited PDF in memory, then sign it
      const editedPDF = await editablePDF.save();
      return await this.signPDF(editedPDF);
    }
    finally {
      // Always release the browser, including when a step above threw.
      try {
        await this.teardown();
      }
      catch { /* Ignore teardown errors */ }
    }
  }

  /**
   * Sets up the browser used for capture as well as network interception for images capture.
   * Populates `this.playwright`.
   * @returns {Promise<void>}
   */
  setup = async() => {
    // Emulates a mobile device: user agent and viewport come from Playwright's device registry.
    const { userAgent, viewport } = devices["Pixel 2 XL"];

    this.playwright.browser = await chromium.launch({
      headless: true,
      channel: "chrome",
    });

    this.playwright.context = await this.playwright.browser.newContext({ userAgent });
    this.playwright.page = await this.playwright.context.newPage();

    this.playwright.viewport = viewport;
    await this.playwright.page.setViewportSize(viewport);

    // Short pause inherited from the original implementation (marked "[Debug]" there).
    await new Promise(resolve => setTimeout(resolve, 500));

    this.playwright.page.on("response", this.interceptJpegs);

    this.playwright.ready = true;
  }

  /**
   * Closes browser used for capture and marks Playwright as no longer ready.
   * The browser itself is closed even if closing the page or context fails.
   * @returns {Promise<void>}
   */
  teardown = async() => {
    const { page, context, browser } = this.playwright;

    try {
      await page?.close();
      await context?.close();
    }
    finally {
      this.playwright.ready = false;
      await browser?.close();
    }
  }

  /**
   * Adjusts the current page's DOM so the resulting PDF is not affected by UI artifact.
   * Playwright needs to be ready.
   *
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  adjustUIForCapture = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    await this.playwright.page.evaluate(async() => {
      // Nav bar and header
      document
        .querySelector("div[data-testid='TopNavBar']")
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("header")
        ?.setAttribute("style", "display: none;");

      // Bottom bar, and its two closest ancestors
      document
        .querySelector("div[data-testid='BottomBar']")
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("div[data-testid='BottomBar']")
        ?.parentNode
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("div[data-testid='BottomBar']")
        ?.parentNode
        ?.parentNode
        ?.setAttribute("style", "display: none;");

      // Full-screen dialog
      document
        .querySelector("div[role='dialog']")
        ?.setAttribute("style", "display: none;");

      // "Log in" bar: dismissed by clicking its "Not now" element, if present.
      document
        .evaluate(
          "//span[text()='Not now']",
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        )
        ?.singleNodeValue
        ?.click();
    });
  }

  /**
   * Runs browser behaviors:
   * - Tries to scroll through the page.
   * - Tries to click on the next available "Show replies" button.
   *
   * Stops waiting after `options.browserBehaviorsTimeout`. Errors are ignored (best-effort).
   * Playwright needs to be ready.
   *
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  runBrowserBehaviors = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    let timer = null;

    // Max execution time for the browser behaviors
    const timeout = new Promise((resolve) => {
      timer = setTimeout(resolve, this.options.browserBehaviorsTimeout);
    });

    // Behaviors script (runs in the page)
    const behaviors = this.playwright.page.evaluate(async () => {
      let scrollTop = document.documentElement.scrollTop;

      while (true) {
        // Auto scroll: +100px every 250ms
        scrollTop += 100;
        window.scrollTo({top: scrollTop});
        await new Promise(resolve => setTimeout(resolve, 250));

        // Auto click on first available "Show replies" button
        const showRepliesButton = document.evaluate(
          "//span[text()='Show replies']",
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        )?.singleNodeValue;

        if (showRepliesButton) {
          showRepliesButton.click();
          await new Promise(resolve => setTimeout(resolve, 1500));
        }

        // Break when reaching bottom of page
        if (scrollTop >= Math.max(document.body.scrollHeight, window.outerHeight)) {
          break;
        }
      }
    });

    try {
      await Promise.race([timeout, behaviors]);
    }
    catch(err) { /* Behaviors are best-effort: errors are ignored. */ }
    finally {
      // Do not leave the timer pending once the race is settled.
      clearTimeout(timer);
    }
  }

  /**
   * Stretches the viewport to match the document's dimensions,
   * then waits `options.renderTimeout` for re-renders.
   * @returns {Promise<void>}
   */
  resizeViewportToFitDocument = async() => {
    const { width, height } = await this.getDocumentDimensions();

    await this.playwright.page.setViewportSize({ width, height });

    await new Promise(resolve => setTimeout(resolve, this.options.renderTimeout));
  }

  /**
   * Returns the current dimensions of the document.
   * Playwright needs to be ready.
   * @returns {Promise<{width: number, height: number}>}
   * @throws {Error} If Playwright is not ready.
   */
  getDocumentDimensions = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    return await this.playwright.page.evaluate(() => {
      const width = Math.max(document.body.scrollWidth, window.outerWidth);
      const height = Math.max(document.body.scrollHeight, window.outerHeight);
      return {width, height};
    });
  }

  /**
   * Uses Playwright's network interception to capture images and add them to `this.interceptedJPEGs`.
   * Called whenever Playwright processes an HTTP response.
   *
   * @param {playwright.Response} response
   * @returns {Promise<void>}
   */
  interceptJpegs = async(response) => {
    try {
      const headers = await response.allHeaders();

      // Ignore optional media type parameters (i.e: "image/jpeg; foo=bar").
      const contentType = headers["content-type"]?.split(";")[0].trim().toLowerCase();

      if (contentType === "image/jpeg") {
        this.interceptedJPEGs[response.url()] = await response.body();
      }
    }
    catch (err) {
      // Some exchanges can't be captured, and that's okay :).
    }
  }

  /**
   * Generates a PDF of the current page using Chrome Dev Tools.
   * Playwright needs to be ready.
   *
   * @returns {Promise<Buffer>} - PDF Bytes
   * @throws {Error} If Playwright is not ready.
   */
  generateRawPDF = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    // Scroll up
    await this.playwright.page.evaluate(() => window.scrollTo({top: 0}));

    // Generate document, sized after the document itself and using "screen" styles
    await this.playwright.page.emulateMedia({media: "screen"});

    const dimensions = await this.getDocumentDimensions();

    return await this.playwright.page.pdf({
      printBackground: true,
      width: dimensions.width,
      height: dimensions.height
    });
  }

  /**
   * Adds entries from `this.interceptedJPEGs` to the PDF as attachments.
   * Filenames are derived from each image's url path and query string.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   */
  addInterceptedJPEGsToPDF = async(editablePDF) => {
    for (const [url, buffer] of Object.entries(this.interceptedJPEGs)) {
      const parsedUrl = new URL(url);
      let filename = `${parsedUrl.pathname}${parsedUrl.search}`.replaceAll("/", "-");

      if (!filename.endsWith(".jpg")) {
        filename += ".jpg";
      }

      if (filename.startsWith("-")) {
        filename = filename.substring(1);
      }

      // The Buffer is passed as-is: `buffer.buffer` may be a larger, shared ArrayBuffer.
      await editablePDF.attach(buffer, filename, {
        mimeType: "image/jpeg",
        description: `Image captured from ${this.url}`,
        creationDate: new Date(),
        modificationDate: new Date(),
      });
    }
  }

  /**
   * Tries to list and resolve all the `t.co` urls on the page, and add the resulting map as an attachment.
   * Nothing is attached if the page contains no `t.co` link.
   *
   * Attachment filename: `url-map.csv`.
   * Playwright needs to be ready.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  captureAndAddUrlMapToPDF = async(editablePDF) => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    const filename = "url-map.csv";

    // Capture (deduplicated) urls to resolve
    const shortUrls = await this.playwright.page.evaluate(() => {
      const urls = {};

      for (const a of document.querySelectorAll("a[href^='https://t.co']")) {
        urls[a.getAttribute("href")] = true;
      }

      return Object.keys(urls);
    });

    if (shortUrls.length < 1) {
      return;
    }

    /** @type {object} - Short url to resolved url. `false` until resolved. */
    const map = {};

    for (const url of shortUrls) {
      map[url] = false;
    }

    // Try to resolve urls (in parallel). Urls that can't be resolved are left blank.
    const resolveShortUrl = async(url) => {
      try {
        const response = await fetch(url, { method: "HEAD" });
        map[url] = response.url;
      }
      catch(err) { /* Best-effort */ }
    };

    await Promise.allSettled(shortUrls.map(url => resolveShortUrl(url)));

    // Generate and attach CSV
    let output = "short;long\n";

    for (const [short, long] of Object.entries(map)) {
      output += `"${short}";"${long || ""}"\n`;
    }

    await editablePDF.attach(Buffer.from(output), filename, {
      mimeType: "text/csv",
      description: `t.co links from ${this.url}`,
      creationDate: new Date(),
      modificationDate: new Date(),
    });
  }

  /**
   * Tries to capture video(s) from current Twitter url and add them as attachment to the PDF.
   * Capture itself is best-effort, but `yt-dlp` must be available.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   * @throws {Error} If the `yt-dlp` executable is not available or cannot be executed.
   */
  captureAndAddVideoToPDF = async(editablePDF) => {
    const id = uuidv4();
    const filepathOut = `${this.options.tmpFolderPath}${id}-%(autonumber)d.mp4`;
    const ytDlpExecutable = this.options.ytDlpPath;

    // yt-dlp health check
    try {
      const result = spawnSync(ytDlpExecutable, ["--version"], {encoding: "utf8"});

      // `error` is set when the process could not be spawned at all.
      if (result.error) {
        throw result.error;
      }

      if (result.status !== 0) {
        throw new Error(result.stderr);
      }

      const version = result.stdout.trim();

      if (!version.match(/^[0-9]{4}\.[0-9]{2}\.[0-9]{2}$/)) {
        throw new Error(`Unknown version: ${version}`);
      }
    }
    catch(err) {
      throw new Error(`"yt-dlp" executable is not available or cannot be executed. ${err}`, { cause: err });
    }

    // Capture
    try {
      const dlpOptions = [
        "--no-warnings", // Prevents pollution of stdout
        "--no-progress", // (Same as above)
        "--format", "mp4", // Forces .mp4 format
        "--output", filepathOut,
        this.url
      ];

      const spawnOptions = {
        timeout: this.options.videoCaptureTimeout,
        encoding: "utf8",
      };

      const result = spawnSync(ytDlpExecutable, dlpOptions, spawnOptions);

      if (result.status !== 0) {
        throw new Error(result.stderr);
      }

      // Numeric-aware sort so that "-10.mp4" comes after "-2.mp4".
      const videos = globSync(filepathOut.replace("%(autonumber)d", "*"))
        .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }));

      let i = 1;

      for (const file of videos) {
        // The Buffer is passed as-is: `video.buffer` may be a larger, shared ArrayBuffer.
        const video = fs.readFileSync(file);

        await editablePDF.attach(video, `video-${i}.mp4`, {
          mimeType: "video/mp4",
          description: `Video captured from ${this.url}`,
          creationDate: new Date(),
          modificationDate: new Date(),
        });

        i++;
      }
    }
    catch(err) { /* Video capture is best-effort: most urls have no video. */ }
    finally {
      // Clean up everything written for this capture, including partial downloads.
      try {
        for (const leftover of globSync(`${this.options.tmpFolderPath}${id}-*`)) {
          fs.unlink(leftover, () => {});
        }
      }
      catch(err) { /* Ignore clean up errors */ }
    }
  }

  /**
   * Tries to remove some of the white space at the bottom of the PDF.
   * [!] TODO: This is a "let's ship it" hack. We will need to find a better solution.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   */
  cropMarginsOnPDF = async(editablePDF) => {
    const page = editablePDF.getPage(0);
    const originalHeight = page.getHeight();

    // Only crop if content > viewport
    if (this.playwright.viewport.height > originalHeight) {
      return;
    }

    // Percentage of the page height to remove.
    const reductionFactor = this.options.runBrowserBehaviors ? 44 : 88;
    const newHeight = Math.floor(originalHeight - (originalHeight / 100 * reductionFactor));
    const yShift = originalHeight - newHeight;

    page.setSize(page.getWidth(), newHeight);
    page.translateContent(0, -yShift);
  }

  /**
   * Signs a PDF using `pyhanko`, via temporary files written to `options.tmpFolderPath`.
   * Temporary files are removed whether signing succeeds or fails.
   *
   * @param {Buffer} editedPDF - PDF Bytes
   * @returns {Promise<Buffer>} - PDF Bytes
   * @throws {Error} If `pyhanko` could not be run or exited with a non-zero status.
   */
  signPDF = async(editedPDF) => {
    const id = uuidv4();
    const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
    const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;

    try {
      // Save PDF to disk
      fs.writeFileSync(filepathIn, editedPDF);

      const run = spawnSync(
        "pyhanko",
        [
          "sign",
          "addsig",
          "--field",
          "Sig1",
          "--timestamp-url",
          this.options.timestampServerUrl,
          "pemder",
          "--key",
          this.options.privateKeyPath,
          "--cert",
          this.options.certPath,
          "--no-pass",
          filepathIn,
          filepathOut
        ],
        {encoding: "utf-8"}
      );

      // `error` is set when the process could not be spawned at all.
      if (run.error) {
        throw run.error;
      }

      if (run.status !== 0) {
        throw new Error(run.stderr);
      }

      // Load signed file from disk and return
      return fs.readFileSync(filepathOut);
    }
    finally {
      fs.unlink(filepathIn, () => {});
      fs.unlink(filepathOut, () => {});
    }
  }

  /**
   * Applies some basic filtering to new option objects and fills gaps with defaults.
   * Only keys present in `TwitterCapture.defaults` are kept, and values are coerced
   * to the type of their default. Numeric values that are not numbers fall back to defaults.
   *
   * Replaces `this.options` after filtering.
   *
   * @param {object} newOptions
   */
  filterOptions = (newOptions) => {
    const provided = newOptions ?? {};
    const defaults = TwitterCapture.defaults;
    const options = {};

    for (const key of Object.keys(defaults)) {
      const value = key in provided ? provided[key] : defaults[key];

      switch (typeof defaults[key]) {
        case "boolean":
          options[key] = Boolean(value);
          break;

        case "number": {
          const asNumber = Number(value);
          options[key] = Number.isNaN(asNumber) ? defaults[key] : asNumber;
          break;
        }

        case "string":
          options[key] = String(value);
          break;

        default:
          options[key] = value;
      }
    }

    this.options = options;
  }

  /**
   * Filters a given URL to ensure it's a `twitter.com` one, and removes known tracking params.
   * Also determines its "type": "status", "search", "profile".
   *
   * Automatically populates `this.url` and `this.urlType`.
   *
   * @param {string} url
   * @returns {boolean} - Always `true`. Invalid urls throw.
   * @throws {Error} If `url` is not a valid `https://twitter.com` url.
   */
  filterUrl = (url) => {
    /** @type {?URL} */
    let parsedUrl = null;

    /** @type {?string} */
    let urlType = null;

    // Determine if `url` is a valid `twitter.com` and remove known tracking params
    try {
      parsedUrl = new URL(url); // Will throw if not a valid url.

      if (parsedUrl.origin !== "https://twitter.com") {
        throw new Error();
      }

      parsedUrl.searchParams.delete("s");
      parsedUrl.searchParams.delete("t");
      parsedUrl.searchParams.delete("ctx");
    }
    catch (err) {
      throw new Error(`${url} is not a valid Twitter url.`);
    }

    // Determine Twitter url "type". Anything that's not a status or a search is treated as a profile.
    if (parsedUrl.pathname.includes("/status/")) {
      urlType = "status";
    }
    else if (parsedUrl.pathname.includes("/search")) {
      urlType = "search";
    }
    else {
      urlType = "profile";
    }

    this.url = parsedUrl.href;
    this.urlType = urlType;

    return true;
  }
}