archive.social/app/utils/TwitterCapture.js

/**
 * thread-keeper
 * @module utils.TwitterCapture
 * @author The Harvard Library Innovation Lab
 * @license MIT
 */
import fs from "fs";
import { spawnSync } from "child_process";
import { chromium } from "playwright";

import { v4 as uuidv4 } from "uuid";
import { PDFDocument } from "pdf-lib";
import nunjucks from "nunjucks";

import { CERTS_PATH, TMP_PATH, EXECUTABLES_FOLDER, TEMPLATES_PATH, APP_VERSION } from "../const.js";

/**
 * Generates a "sealed" PDF out of a twitter.com url using Playwright.
 *
 * Usage:
 * ```
 * const tweet = new TwitterCapture(url);
 * const pdf = await tweet.capture();
 * fs.writeFileSync("tweet.pdf", pdf);
 * ```
 */
export class TwitterCapture {
  /**
   * Defaults for options that can be passed to `TwitterCapture`.
   * @property {string} appVersion
   * @property {string} privateKeyPath - Path to `.pem` file containing a private key.
   * @property {string} certPath - Path to a `.pem` file containing a certificate.
   * @property {string} tmpFolderPath - Path to a folder in which temporary files can be written. Used as a raw prefix, so it must end with a path separator.
   * @property {string} ytDlpPath - Path to the `yt-dlp` executable.
   * @property {string} templatesFolderPath - Path to the templates folder (t.co resolver summary feature).
   * @property {string} timestampServerUrl - Timestamping server.
   * @property {number} networkidleTimeout - Time to wait for "networkidle" state (ms).
   * @property {boolean} runBrowserBehaviors - If `true`, will try to auto-scroll and open more responses. Set to `false` automatically when trying to capture a profile url.
   * @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time (ms).
   * @property {number} videoCaptureTimeout - Maximum yt-dlp execution time (ms).
   * @property {number} renderTimeout - Time to wait for re-renders (ms).
   */
  static defaults = {
    appVersion: APP_VERSION,
    privateKeyPath: `${CERTS_PATH}key.pem`,
    certPath: `${CERTS_PATH}cert.pem`,
    tmpFolderPath: `${TMP_PATH}`,
    templatesFolderPath: `${TEMPLATES_PATH}pdf-attachments/`,
    ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
    timestampServerUrl: "http://timestamp.digicert.com",
    networkidleTimeout: 5000,
    runBrowserBehaviors: true,
    browserBehaviorsTimeout: 33500,
    videoCaptureTimeout: 10000,
    renderTimeout: 4000,
  };

  /** @type {object} */
  options = {}; // Based on TwitterCapture.defaults

  /** @type {?string} */
  url = null;

  /** @type {?string} */
  urlType = null;

  /**
   * @type {{
   *   browser: playwright.Browser,
   *   context: playwright.BrowserContext,
   *   page: playwright.Page,
   *   viewport: ?{width: number, height: number},
   *   ready: boolean
   * }}
   */
  playwright = {
    browser: null,
    context: null,
    page: null,
    viewport: null,
    ready: false
  };

  /**
   * JPEG response bodies intercepted during capture, keyed by response url.
   * @type {object<string, Buffer>}
   */
  interceptedJPEGs = {};

  /**
   * @param {string} url - `twitter.com` url to capture. Works best on statuses and threads.
   * @param {object} options - See `TwitterCapture.defaults` for detailed options. Will use defaults unless overridden.
   * @throws {Error} If `url` is not a valid `https://twitter.com` url.
   */
  constructor(url, options = {}) {
    this.filterUrl(url);
    this.filterOptions(options);

    // Options adjustments:
    if (this.urlType === "profile") {
      this.options.runBrowserBehaviors = false;
    }
  }

  /**
   * Captures the current Twitter.com url and makes it a signed PDF.
   *
   * The browser is always torn down before this method settles, whether the capture
   * succeeded or threw.
   *
   * @returns {Promise<Buffer>} - Signed PDF.
   * @throws {Error} If PDF generation, the `yt-dlp` health check or signing fails.
   */
  capture = async() => {
    try {
      // Playwright init (inside the `try` so a partially-initialized browser still gets closed)
      await this.setup();

      // Page load + network idle
      try {
        await this.playwright.page.goto(this.url, {
          waitUntil: "networkidle",
          timeout: this.options.networkidleTimeout,
        });
      }
      catch(err) { /* Timeout errors are non-blocking */ }

      // Adjust UI (#1)
      await this.adjustUIForCapture();

      // Run browser behaviors, or simply give the page some time to settle
      if (this.options.runBrowserBehaviors === true) {
        await this.runBrowserBehaviors();
      }
      else {
        await new Promise((resolve) =>
          setTimeout(resolve, this.options.networkidleTimeout + this.options.renderTimeout)
        );
      }

      // Wait for network idle (must be called on the page: this class has no such method)
      try {
        await this.playwright.page.waitForLoadState("networkidle", {
          timeout: this.options.networkidleTimeout
        });
      }
      catch(err) { /* Timeout errors are non-blocking */ }

      // Adjust UI (#2 - Accounts for re-renders)
      await this.adjustUIForCapture();

      // Resize browser to fit document dimensions
      if (this.urlType !== "profile") { // Skipped on profile pages
        await this.resizeViewportToFitDocument();
      }

      // Generate raw PDF and open editable PDF
      const rawPDF = await this.generateRawPDF();
      const editablePDF = await PDFDocument.load(rawPDF);

      // Add metadata, remove extraneous page
      try {
        editablePDF.setTitle(`Capture of ${this.url} by thread-keeper on ${new Date().toISOString()}`);
        editablePDF.setCreationDate(new Date());
        editablePDF.setModificationDate(new Date());
        editablePDF.setProducer(`thread-keeper ${this.options.appVersion}`);
        editablePDF.removePage(1); // This step may throw if there's only 1 page.
      }
      catch {
        // Best-effort: a failure here must not abort the capture.
      }

      // Try to crop remaining white space
      await this.cropMarginsOnPDF(editablePDF);

      // Add intercepted JPEGs as attachments
      await this.addInterceptedJPEGsToPDF(editablePDF);

      // Try to capture t.co to full urls map, and add it as attachment
      await this.captureAndAddUrlMapToPDF(editablePDF);

      // Try to capture video, if any, and add it as attachment
      await this.captureAndAddVideoToPDF(editablePDF);

      // Freeze edited PDF in memory, then sign it
      const editedPDF = await editablePDF.save();
      return await this.signPDF(editedPDF);
    }
    finally {
      // Teardown always runs so a failed capture does not leak a browser process.
      try {
        await this.teardown();
      }
      catch { /* Ignore teardown errors */ }
    }
  }

  /**
   * Sets up the browser used for capture as well as network interception for images capture.
   * Populates `this.playwright`.
   * @returns {Promise<void>}
   */
  setup = async() => {
    // NOTE(review): `chromium._playwright` is not a documented entry point; the
    // `devices` export of `playwright` exposes the same descriptors.
    const userAgent = chromium._playwright.devices["Pixel 2 XL"].userAgent;
    const viewport = chromium._playwright.devices["Pixel 2 XL"].viewport;

    this.playwright.browser = await chromium.launch({
      headless: true,
      channel: "chrome",
    });
    this.playwright.context = await this.playwright.browser.newContext({ userAgent });
    this.playwright.page = await this.playwright.context.newPage();

    this.playwright.viewport = viewport;

    // Awaited so the viewport is in place (and failures surface) before navigation starts.
    await this.playwright.page.setViewportSize(viewport);

    await new Promise(resolve => setTimeout(resolve, 500)); // [Debug]
    this.playwright.page.on("response", this.interceptJpegs);

    this.playwright.ready = true;
  }

  /**
   * Closes browser used for capture and marks Playwright as no longer ready.
   * Safe to call after a partial `setup()`.
   * @returns {Promise<void>}
   */
  teardown = async() => {
    const { page, context, browser } = this.playwright;

    // Flip the flag first: guarded methods must stop using the page even if a close() call throws.
    this.playwright.ready = false;

    try {
      await page?.close();
      await context?.close();
    }
    finally {
      // Always attempt to close the browser itself, even if page/context closing failed.
      await browser?.close();
    }
  }

  /**
   * Adjusts the current page's DOM so the resulting PDF is not affected by UI artifact.
   * Hides the top nav bar, header, bottom bar (and two of its ancestors), the first dialog,
   * and clicks the "Not now" element if present.
   * Playwright needs to be ready.
   *
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  adjustUIForCapture = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    await this.playwright.page.evaluate(async() => {
      // Nav bar and header
      document
        .querySelector("div[data-testid='TopNavBar']")
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("header")
        ?.setAttribute("style", "display: none;");

      // Bottom bar
      document
        .querySelector("div[data-testid='BottomBar']")
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("div[data-testid='BottomBar']")
        ?.parentNode
        ?.setAttribute("style", "display: none;");

      document
        .querySelector("div[data-testid='BottomBar']")
        ?.parentNode
        ?.parentNode
        ?.setAttribute("style", "display: none;");

      // Full-screen dialog
      document
        .querySelector("div[role='dialog']")
        ?.setAttribute("style", "display: none;");

      // "Log in" bar
      document
        .evaluate(
          "//span[text()='Not now']",
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        )
        ?.singleNodeValue
        ?.click();
    });
  }

  /**
   * Runs browser behaviors:
   * - Tries to scroll through the page.
   * - Tries to click on the next available "Show replies" button.
   *
   * Stops waiting after `options.browserBehaviorsTimeout` ms. Errors are ignored.
   * Playwright needs to be ready.
   *
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  runBrowserBehaviors = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    let timer = null;

    try {
      await Promise.race([
        // Max execution time for the browser behaviors
        new Promise((resolve) => {
          timer = setTimeout(resolve, this.options.browserBehaviorsTimeout);
        }),

        // Behaviors script
        this.playwright.page.evaluate(async () => {
          let scrollTop = document.documentElement.scrollTop;

          while (true) {
            // Auto scroll: +100px every 250ms
            scrollTop += 100;
            window.scrollTo({top: scrollTop});
            await new Promise(resolve => setTimeout(resolve, 250));

            // Auto click on first available "Show replies" button
            let showRepliesButton = document.evaluate(
              "//span[text()='Show replies']",
              document,
              null,
              XPathResult.FIRST_ORDERED_NODE_TYPE,
              null
            )?.singleNodeValue;

            if (showRepliesButton) {
              showRepliesButton.click();
              await new Promise(resolve => setTimeout(resolve, 1500));
            }

            // Break when reaching bottom of page
            if (scrollTop >= Math.max(document.body.scrollHeight, window.outerHeight)) {
              break;
            }
          }
        })
      ]);
    }
    catch(err) {
      // Behaviors are best-effort: errors are deliberately ignored.
    }
    finally {
      // Don't leave the timeout pending once the race has settled.
      clearTimeout(timer);
    }
  }

  /**
   * Stretches the viewport to match the document's dimensions, then waits
   * `options.renderTimeout` ms for the page to re-render.
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  resizeViewportToFitDocument = async() => {
    const { width, height } = await this.getDocumentDimensions();

    await this.playwright.page.setViewportSize({ width, height });

    await new Promise(resolve => setTimeout(resolve, this.options.renderTimeout));
  }

  /**
   * Returns the current dimensions of the document: the larger of the body's scroll size
   * and the window's outer size, on each axis.
   * Playwright needs to be ready.
   * @returns {Promise<{width: number, height: number}>}
   * @throws {Error} If Playwright is not ready.
   */
  getDocumentDimensions = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    return await this.playwright.page.evaluate(() =>  {
      const width = Math.max(document.body.scrollWidth, window.outerWidth);
      const height = Math.max(document.body.scrollHeight, window.outerHeight);
      return {width, height};
    });
  }

  /**
   * Uses Playwright's network interception to capture images and add them to `this.interceptedJPEGs`.
   * Called whenever Playwright processes an HTTP response.
   *
   * @param {playwright.Response} response
   * @returns {Promise<void>}
   */
  interceptJpegs = async(response) => {
    try {
      const headers = await response.allHeaders();

      // Compare the media type only: the header may carry parameters (e.g. "; charset=...").
      const mediaType = String(headers["content-type"] ?? "").split(";")[0].trim().toLowerCase();

      if (mediaType === "image/jpeg") {
        const image = await response.body();
        this.interceptedJPEGs[response.url()] = image;
      }
    }
    catch (err) {
      // Some exchanges can't be captured, and that's okay :).
    }
  }

  /**
   * Generates a PDF of the current page, sized to the document's dimensions.
   * Scrolls back to the top and switches to "screen" media emulation first.
   * Playwright needs to be ready.
   *
   * @returns {Promise<Buffer>} - PDF Bytes
   * @throws {Error} If Playwright is not ready.
   */
  generateRawPDF = async() => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    // Scroll up
    await this.playwright.page.evaluate(() => window.scrollTo({top: 0}));

    // Generate document
    await this.playwright.page.emulateMedia({media: 'screen'});
    const dimensions = await this.getDocumentDimensions();

    return await this.playwright.page.pdf({
      printBackground: true,
      width: dimensions.width,
      height: dimensions.height
    });
  }

  /**
   * Adds entries from `this.interceptedJPEGs` to the PDF as attachments.
   * The attachment filename is derived from the image url's path and query string.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   */
  addInterceptedJPEGsToPDF = async(editablePDF) => {
    for (const [url, buffer] of Object.entries(this.interceptedJPEGs)) {
      const parsedUrl = new URL(url);
      let filename = `${parsedUrl.pathname}${parsedUrl.search}`.replaceAll("/", "-");

      if (!filename.endsWith(".jpg")) {
        filename += ".jpg";
      }

      if (filename.startsWith("-")) {
        filename = filename.substring(1);
      }

      // Pass the Buffer itself (a Uint8Array). `buffer.buffer` is the underlying ArrayBuffer,
      // which may be a larger shared pool and would attach unrelated bytes.
      await editablePDF.attach(buffer, filename, {
        mimeType: 'image/jpeg',
        description: `Image captured from ${this.url}`,
        creationDate: new Date(),
        modificationDate: new Date(),
      });
    }
  }

  /**
   * Tries to list and resolve all the `t.co` urls on the page, and add the resulting map as an attachment.
   * Nothing is attached when the page contains no `t.co` link. Urls that could not be
   * resolved are listed with an empty "long" value.
   *
   * Attachment filename: `url-map.csv`.
   * Playwright needs to be ready.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   * @throws {Error} If Playwright is not ready.
   */
  captureAndAddUrlMapToPDF = async(editablePDF) => {
    if (this.playwright.ready !== true) {
      throw new Error("Playwright is not ready.");
    }

    /** @type {object<string, boolean|string>} */
    const map = {};
    const filename = "url-map.csv";

    // Capture urls to resolve (de-duplicated in the page)
    const shortUrls = await this.playwright.page.evaluate(() => {
      const urls = {};

      for (let a of document.querySelectorAll("a[href^='https://t.co']")) {
        urls[a.getAttribute("href")] = true;
      }

      return Object.keys(urls);
    });

    if (shortUrls.length < 1) {
      return;
    }

    for (const url of shortUrls) {
      map[url] = false;
    }

    // Try to resolve urls (in parallel). Failures leave the entry set to `false`.
    const resolveShortUrl = async(url) => {
      try {
        const response = await fetch(url, { method: "HEAD" });
        map[url] = response.url;
      }
      catch(err) { /* Unresolvable urls are reported with an empty "long" column */ }
    };

    await Promise.allSettled(shortUrls.map(url => resolveShortUrl(url)));

    // Generate and attach CSV
    let output = "short;long\n";
    for (const [short, long] of Object.entries(map)) {
      output += `"${short}";"${long ? long : ''}"\n`;
    }

    await editablePDF.attach(Buffer.from(output), filename, {
      mimeType: 'text/csv',
      description: `t.co links from ${this.url}`,
      creationDate: new Date(),
      modificationDate: new Date(),
    });
  }

  /**
   * Tries to capture main video from current Twitter url and add it as attachment to the PDF.
   * The capture itself is best-effort (errors are ignored), but an unusable `yt-dlp`
   * executable is reported as an error.
   *
   * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
   * @returns {Promise<void>}
   * @throws {Error} If the `yt-dlp` executable is missing, fails to run or reports an unexpected version.
   */
  captureAndAddVideoToPDF = async(editablePDF) => {
    const id = uuidv4();
    const filepathOut = `${this.options.tmpFolderPath}${id}.mp4`;
    const ytDlpExecutable = this.options.ytDlpPath;

    // yt-dlp health check
    try {
      const result = spawnSync(ytDlpExecutable, ["--version"], {encoding: "utf8"});

      // Spawn failures (e.g. ENOENT) come back with `status: null` and the cause in `error`.
      if (result.error) {
        throw result.error;
      }

      if (result.status !== 0) {
        throw new Error(result.stderr);
      }

      const version = result.stdout.trim();

      if (!version.match(/^[0-9]{4}\.[0-9]{2}\.[0-9]{2}$/)) {
        throw new Error(`Unknown version: ${version}`);
      }
    }
    catch(err) {
      throw new Error(`"yt-dlp" executable is not available or cannot be executed. ${err}`);
    }

    // Capture
    try {
      const dlpOptions = [
        "--no-warnings", // Prevents pollution of stdout
        "--no-progress", // (Same as above)
        "--format", "mp4", // Forces .mp4 format
        "--output", filepathOut,
        this.url
      ];

      const spawnOptions = {
        timeout: this.options.videoCaptureTimeout,
        encoding: "utf8",
      };

      const result = spawnSync(ytDlpExecutable, dlpOptions, spawnOptions);

      if (result.status !== 0) {
        throw new Error(result.stderr);
      }

      const video = fs.readFileSync(filepathOut);

      // Pass the Buffer itself rather than its underlying ArrayBuffer (see addInterceptedJPEGsToPDF).
      await editablePDF.attach(video, "video.mp4", {
        mimeType: 'video/mp4',
        description: `Video captured from ${this.url}`,
        creationDate: new Date(),
        modificationDate: new Date(),
      });
    }
    catch(err) {
      // Best-effort: most urls have no video, so failures here are expected and ignored.
    }
    finally {
      // Always clean up, including when yt-dlp failed or timed out. Errors (file missing) are ignored.
      fs.unlink(filepathOut, () => {});
    }
  }

  /**
   * Tries to remove some of the white space at the bottom of the PDF.
   * Shrinks the first page by a fixed percentage of its height (44% when browser behaviors
   * are enabled, 88% otherwise) and shifts its content accordingly.
   * [!] TODO: This is a "let's ship it" hack. We will need to find a better solution.
   *
   * @param {PDFDocument} editablePDF
   * @returns {Promise<void>}
   */
  cropMarginsOnPDF = async(editablePDF) => {
    const page = editablePDF.getPage(0);
    const originalHeight = page.getHeight();

    // Only crop if content > viewport
    if (this.playwright.viewport.height > originalHeight) {
      return;
    }

    const reductionFactor = this.options.runBrowserBehaviors ? 44 : 88;

    const newHeight = Math.floor(originalHeight - (originalHeight / 100 * reductionFactor));
    const yShift = originalHeight - newHeight;

    page.setSize(page.getWidth(), newHeight);
    page.translateContent(0, -yShift);
  }

  /**
   * Signs and timestamps a PDF by running `pyhanko` on temporary files written to
   * `options.tmpFolderPath`. Temporary files are removed whether signing succeeds or not.
   *
   * @param {Buffer} editedPDF - PDF Bytes
   * @returns {Promise<Buffer>} - Signed PDF Bytes
   * @throws {Error} If `pyhanko` cannot be executed or exits with a non-zero status.
   */
  signPDF = async(editedPDF) => {
    const id = uuidv4();
    const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
    const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;

    try {
      // Save PDF to disk
      fs.writeFileSync(filepathIn, editedPDF);

      const run = spawnSync("pyhanko",
      [
        "sign",
        "addsig",
          "--field", "Sig1",
          "--timestamp-url", this.options.timestampServerUrl,
        "pemder",
          "--key", this.options.privateKeyPath,
          "--cert", this.options.certPath,
          "--no-pass",
        filepathIn,
        filepathOut
      ],
      {encoding: "utf-8"});

      // Spawn failures (e.g. ENOENT) come back with `status: null` and the cause in `error`.
      if (run.error) {
        throw run.error;
      }

      if (run.status !== 0) {
        throw new Error(run.stderr);
      }

      // Load signed file from disk and return
      return fs.readFileSync(filepathOut);
    }
    finally {
      // Clean up on both success and failure. Errors (file missing) are ignored.
      fs.unlink(filepathIn, () => {});
      fs.unlink(filepathOut, () => {});
    }
  }

  /**
   * Applies some basic filtering to new option objects and fills gaps with defaults.
   * Only keys present in `TwitterCapture.defaults` are kept, and each value is coerced to
   * the type of its default. Numeric options that coerce to `NaN` fall back to the default.
   * Replaces `this.options` after filtering.
   *
   * Synchronous, so that options are guaranteed to be set (and errors surfaced) by the
   * time the constructor returns.
   *
   * @param {?object} newOptions
   * @returns {void}
   */
  filterOptions = (newOptions) => {
    const defaults = TwitterCapture.defaults;
    const options = {};

    // Tolerate `null` / non-object input: the `in` operator would throw on those.
    const provided = (newOptions !== null && typeof newOptions === "object") ? newOptions : {};

    for (const key of Object.keys(defaults)) {
      let value = key in provided ? provided[key] : defaults[key];

      switch (typeof defaults[key]) {
        case "boolean":
          value = Boolean(value);
          break;

        case "number":
          value = Number(value);

          if (Number.isNaN(value)) {
            value = defaults[key]; // A NaN timeout is never meaningful.
          }
          break;

        case "string":
          value = String(value);
          break;
      }

      options[key] = value;
    }

    this.options = options;
  }

  /**
   * Filters a given URL to ensure it's a `twitter.com` one (origin must be exactly
   * `https://twitter.com`) and strips the `s`, `t` and `ctx` query parameters.
   * Also determines its "type": "status", "search", "profile" (anything that is neither a
   * status nor a search is treated as a profile).
   *
   * Automatically populates `this.url` and `this.urlType`.
   *
   * @param {string} url
   * @returns {boolean} Always `true`; invalid urls throw instead.
   * @throws {Error} If `url` cannot be parsed or is not a `https://twitter.com` url.
   */
  filterUrl = (url) => {
    /** @type {?URL} */
    let parsedUrl = null;

    /** @type {?string} */
    let urlType = null;

    // Determine if `url` is a valid `twitter.com` and remove known tracking params
    try {
      parsedUrl = new URL(url); // Will throw if not a valid url.

      if (parsedUrl.origin !== "https://twitter.com") {
        throw new Error();
      }

      parsedUrl.searchParams.delete("s");
      parsedUrl.searchParams.delete("t");
      parsedUrl.searchParams.delete("ctx");
    }
    catch (err) {
      throw new Error(`${url} is not a valid Twitter url.`);
    }

    // Determine Twitter url "type"
    if (parsedUrl.pathname.includes("/status/")) {
      urlType = "status";
    }
    else if (parsedUrl.pathname.includes("/search")) {
      urlType = "search";
    }
    else {
      urlType = "profile";
    }

    this.url = parsedUrl.href;
    this.urlType = urlType;

    return true;
  }
}