pull/1/head
Matteo Cargnelutti 2022-11-15 11:53:23 -05:00
rodzic 538b323f5a
commit 2c3451045b
28 zmienionych plików z 4515 dodań i 1 usunięć

10
.gitignore vendored 100644
Wyświetl plik

@ -0,0 +1,10 @@
node_modules
*.p12
*.pem
test.js
.DS_Store
app/data/*.json
app/data/*.tsv
yt-dlp
app/tmp/*.pdf
app/tmp/*.mp4

4
.prettierrc 100644
Wyświetl plik

@ -0,0 +1,4 @@
{
"tabWidth": 2,
"printWidth": 100
}

Wyświetl plik

@ -1,2 +1,48 @@
# archive.social
Working title / Work in progress / Don't mind me
> 🚧 Work In Progress
---
## Summary
- [Dependencies](#dependencies)
- [Local development](#local-development)
---
## Dependencies
### Runtimes
- [Node.js](https://nodejs.org/) 18+
- [Python](https://www.python.org/) 3.9+.
### Browsers
- Google Chrome _(`npx playwright install --force chrome` may be used)_.
### Python dependencies
- ⚠️ For now: Python dependencies are installed at machine level, as a post-install step of `npm install`.
### Known Debian / Ubuntu packages
```
curl bash gcc g++ python3 python3-pip python3-dev zlib1g zlib1g-dev libjpeg-dev libssl-dev libffi-dev ghostscript poppler-utils
```
Node may be sourced from [Nodesource](https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions).
### For development on Mac OS
A `brewfile` is available. Run `brew bundle` to install machine-level dependencies that can be provided by [homebrew](https://brew.sh/).
[☝️ Back to summary](#summary)
---
## Local development
> 🚧 WIP
```bash
brew bundle # (Mac OS only) - See Linux dependencies above.
npm install
npx playwright install chrome
npm run generate-local-cert # Will generate a certificate for self-signing PDFs
npm run dev
```

48
app/const.js 100644
Wyświetl plik

@ -0,0 +1,48 @@
/**
* archive.social
* @module const.js
* @author The Harvard Library Innovation Lab
* @license MIT
*/
/**
 * Turns a path relative to the project root into an absolute one.
 * The root is whatever the `PWD` environment variable holds when this module is loaded.
 *
 * @param {string} relativePath - Path relative to the project root, without leading slash.
 * @returns {string}
 */
const fromRoot = (relativePath) => `${process.env.PWD}/${relativePath}`;

/**
 * Folder holding the certificates used for signing the PDFs.
 * @constant
 */
export const CERTS_PATH = fromRoot("certs/");

/**
 * Folder in which the app writes its temporary files.
 * @constant
 */
export const TMP_PATH = fromRoot("app/tmp/");

/**
 * The "data" folder.
 * @constant
 */
export const DATA_PATH = fromRoot("app/data/");

/**
 * The "templates" folder.
 * @constant
 */
export const TEMPLATES_PATH = fromRoot("app/templates/");

/**
 * The "executables" folder.
 * @constant
 */
export const EXECUTABLES_FOLDER = fromRoot("executables/");

/**
 * The "static" folder.
 * @constant
 */
export const STATIC_PATH = fromRoot("app/static/");

/**
 * Upper bound on capture processes running in parallel, all access keys combined.
 * @constant
 */
export const MAX_PARALLEL_CAPTURES_TOTAL = 100;

/**
 * Upper bound on capture processes running in parallel for a single access key.
 * @constant
 */
export const MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY = 2;

0
app/data/.keep 100644
Wyświetl plik

215
app/server.js 100644
Wyświetl plik

@ -0,0 +1,215 @@
/**
* archive.social
* @module server.js
* @author The Harvard Library Innovation Lab
* @license MIT
*/
import fs from "fs";
import assert from "assert";
import { validate as uuidValidate } from 'uuid';
import nunjucks from "nunjucks";
import { AccessKeys, SuccessLog, TwitterCapture } from "./utils/index.js";
import {
TEMPLATES_PATH,
STATIC_PATH,
MAX_PARALLEL_CAPTURES_TOTAL,
MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY,
} from "./const.js";
/**
 * Keeps track of how many capture processes are currently running.
 * May be used to redirect users if over capacity.
 *
 * [!] This needs to be upgraded to proper rate limiting after launch.
 *
 * @type {{
 *   currentTotal: number,
 *   maxTotal: number,
 *   currentByAccessKey: object.<string, number>,
 *   maxPerAccessKey: number
 * }}
 */
const CAPTURES_WATCH = {
  currentTotal: 0,
  maxTotal: MAX_PARALLEL_CAPTURES_TOTAL,
  currentByAccessKey: {},
  maxPerAccessKey: MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY,
};

/**
 * Frozen copy of currently valid access keys, loaded once when this module is evaluated.
 * [!] For this alpha: app needs to be restarted for changes to be taken into account.
 *
 * Falls back to an empty frozen object when `AccessKeys.fetch()` yields nothing
 * (e.g. the keys file could not be read), so key lookups never run against `undefined`.
 */
const ACCESS_KEYS = AccessKeys.fetch() ?? Object.freeze({});
/**
 * Fastify plugin registering the routes of the app:
 * - `[GET] /`: landing page / capture form.
 * - `[GET] /test`: renders the success template against a local `download.pdf`.
 * - `[POST] /`: validates and processes a capture request.
 *
 * @param {object} fastify - Fastify instance the routes are registered on.
 * @param {object} opts - Plugin options (currently unused).
 * @returns {Promise<void>}
 */
export default async function (fastify, opts) {
  // Adds support for `application/x-www-form-urlencoded`
  fastify.register(import('@fastify/formbody'));

  // Serves files from STATIC_PATH
  fastify.register(import('@fastify/static'), {
    root: STATIC_PATH,
    prefix: '/static/',
  });

  /**
   * Renders a template from TEMPLATES_PATH and sends it as an HTML response.
   *
   * @param {object} reply - Fastify reply object.
   * @param {number} statusCode - HTTP status code of the response.
   * @param {string} template - Template filename, relative to TEMPLATES_PATH.
   * @param {object} [context] - Variables handed over to the template.
   * @returns {object} - Whatever `reply.send()` returns.
   */
  const sendPage = (reply, statusCode, template, context) => {
    const html = nunjucks.render(`${TEMPLATES_PATH}${template}`, context);

    return reply
      .code(statusCode)
      .header('Content-Type', 'text/html; charset=utf-8')
      .send(html);
  };

  /**
   * Sends the form back to the user with an error dialog.
   *
   * @param {object} reply - Fastify reply object.
   * @param {number} statusCode - HTTP status code of the response.
   * @param {string} errorReason - Error code understood by `index.njk`.
   * @returns {object}
   */
  const sendFormError = (reply, statusCode, errorReason) =>
    sendPage(reply, statusCode, "index.njk", { error: true, errorReason });

  /**
   * [GET] /
   * Shows the landing page / form.
   */
  fastify.get('/', async (request, reply) => sendPage(reply, 200, "index.njk"));

  /**
   * [GET] /test
   * Renders the success page using `download.pdf`, read from the current working directory.
   * NOTE(review): looks like a development-only route; confirm whether it should ship.
   */
  fastify.get('/test', async (request, reply) => sendPage(reply, 200, "success.njk", {
    pdfBase64: fs.readFileSync("download.pdf").toString("base64"),
    url: "https://twitter.com/doctorow/status/1591759999323492358"
  }));

  /**
   * [POST] /
   * Processes a request to capture a twitter url.
   * Renders success page with PDF if capture went through.
   * Returns to form with specific error code, passed as `errorReason`, otherwise.
   */
  fastify.post('/', async (request, reply) => {
    // A request sent without a parseable body must end up as a regular "ACCESS-KEY" error,
    // not as an unhandled TypeError.
    const data = request.body ?? {};
    const accessKey = data["access-key"];

    request.log.info(`Capture capacity: ${CAPTURES_WATCH.currentTotal} / ${CAPTURES_WATCH.maxTotal}.`);

    //
    // Check access key
    //
    try {
      assert(uuidValidate(accessKey));
      assert(ACCESS_KEYS[accessKey]);
    }
    catch(err) {
      return sendFormError(reply, 401, "ACCESS-KEY");
    }

    //
    // Check url
    //
    try {
      const url = new URL(data.url);
      assert(url.origin === "https://twitter.com");
    }
    catch(err) {
      return sendFormError(reply, 400, "URL");
    }

    //
    // Check that there is still capture capacity (total)
    //
    if (CAPTURES_WATCH.currentTotal >= CAPTURES_WATCH.maxTotal) {
      return sendFormError(reply, 503, "TOO-MANY-CAPTURES-TOTAL");
    }

    //
    // Check that there is still capture capacity (for this access key)
    //
    if (CAPTURES_WATCH.currentByAccessKey[accessKey] >= CAPTURES_WATCH.maxPerAccessKey) {
      return sendFormError(reply, 429, "TOO-MANY-CAPTURES-USER");
    }

    //
    // Process capture request
    //
    CAPTURES_WATCH.currentTotal += 1;
    CAPTURES_WATCH.currentByAccessKey[accessKey] =
      (CAPTURES_WATCH.currentByAccessKey[accessKey] ?? 0) + 1;

    try {
      const tweets = new TwitterCapture(data.url);
      const pdf = await tweets.capture();

      SuccessLog.add(accessKey, pdf);

      return sendPage(reply, 200, "success.njk", {
        pdfBase64: pdf.toString('base64'),
        url: tweets.url
      });
    }
    catch(err) {
      request.log.error(`Capture failed. ${err}`);
      return sendFormError(reply, 500, "CAPTURE-ISSUE");
    }
    // In any case: we need to decrease CAPTURES_WATCH counts.
    finally {
      CAPTURES_WATCH.currentTotal -= 1;

      if (accessKey in CAPTURES_WATCH.currentByAccessKey) {
        CAPTURES_WATCH.currentByAccessKey[accessKey] -= 1;

        // Drop idle entries so the map does not grow with every key ever seen.
        if (CAPTURES_WATCH.currentByAccessKey[accessKey] <= 0) {
          delete CAPTURES_WATCH.currentByAccessKey[accessKey];
        }
      }
    }
  });
}

0
app/static/.keep 100644
Wyświetl plik

Plik binarny nie jest wyświetlany.

Wyświetl plik

Wyświetl plik

@ -0,0 +1,255 @@
* {
padding: 0px;
margin: 0px;
box-sizing: border-box;
}
:root {
--main-color: rgb(29, 29, 29);
--main-color-: rgb(57, 57, 57);
--main-color--: rgb(70, 70, 70);
--main-color---: rgb(100, 100, 100);
--main-color----: rgb(125, 125, 125);
--main-color-----: rgb(150, 150, 150);
--opposite-color: rgb(245, 245, 245);
--opposite-color-: rgb(225, 225, 225);
--opposite-color--: rgb(205, 205, 205);
--special-color: rgb(2, 56, 17);
--special-color-: rgb(2, 43, 13);
}
html {
font-size: 20px;
font-size: clamp(18px, 1.35vmax, 32px);
}
body {
font-family: Garamond, serif;
font-size: 1rem;
background-color: var(--main-color);
color: var(--main-color);
padding: 0.5rem;
}
a {
color: var(--special-color);
}
a:hover {
color: var(--special-color-);
text-decoration: none;
}
body > main {
padding: 1rem;
background-color: var(--opposite-color);
}
@media (max-width: 769px) {
body > main {
padding-left: 1.5rem;
padding-right: 1.5rem;
}
}
body > main h1 {
font-size: 2.85rem;
letter-spacing: -0.1rem;
text-align: center;
}
@media (max-width: 769px) {
body > main h1 {
font-size: 2.45rem;
}
}
body > main h2 {
font-size: 1.65rem;
letter-spacing: -0.05rem;
}
@media (max-width: 769px) {
body > main h2 {
font-size: 1.55rem;
}
}
/* Shared heading defaults. Every selector of the list is scoped to `body > main`:
   in a selector list, a prefix written once only applies to the first entry. */
body > main h1,
body > main h2,
body > main h3 {
font-weight: normal;
margin-bottom: 0.25rem;
}
body > main p {
line-height: 1.45rem;
}
/* INDEX - GENERAL*/
body#index > main {
min-height: 100vh;
}
body#index > main button {
font-family: Garamond, serif;
font-size: 0.85rem;
border: 0px;
background-color: var(--main-color);
color: var(--opposite-color);
padding: 0.5rem;
padding-left: 1rem;
padding-right: 1rem;
border-radius: 0.25rem;
cursor: pointer;
transition: background-color 0.35s ease-in-out;
}
body#index > main button:hover {
background-color: var(--main-color-);
}
/* INDEX - HEADER */
body#index > main > img:first-of-type {
display: block;
margin: auto;
margin-bottom: 1rem;
max-width: 60vw;
max-height: 62.5vh;
}
body#index > main header {
text-align: center;
}
/* INDEX - FORM */
body#index > main form {
max-width: 45ch;
display: block;
margin: auto;
border-top: 1px solid var(--main-color-----);
/*border-bottom: 1px solid var(--main-color-----);*/
padding-top: 1.5rem;
margin-bottom: 1rem;
margin-top: 1.5rem;
}
@media (max-width: 769px) {
body#index > main form {
max-width: unset;
}
}
body#index > main form fieldset {
border: 0px;
padding-top: 0.5rem;
padding-bottom: 0.5rem;
width: 100%;
}
body#index > main form label {
display: block;
font-size: 0.95rem;
color: var(--main-color);
padding-bottom: 0.25rem;
}
body#index > main form input {
display: block;
width: 100%;
padding: 0.5rem;
font-family: Garamond, serif;
font-size: 1rem;
background-color: white;
border: 1px solid var(--main-color-----);
border-radius: 0.25rem;
}
body#index > main form fieldset.submit {
text-align: right;
}
body#index > main form fieldset.submit a {
font-size: 0.85rem;
display: inline-block;
padding: 0.5rem;
}
/* INDEX - DIALOG */
body#index > main dialog {
display: none;
}
body#index > main dialog[open] {
display: unset;
margin: auto;
border: 0px;
width: 50ch;
max-width: 90%;
padding: 2rem;
border: 0.5rem solid var(--main-color);
}
body#index > main dialog[open] p {
font-size: 0.9rem;
line-height: 1.45rem;
margin-bottom: 1rem;
}
body#index > main dialog[open]::backdrop {
background-color: rgba(0,0,0,0.65);
overflow: hidden;
}
/* INDEX - EXPLAINER */
body#index > main section {
max-width: 45ch;
margin: auto;
padding-top: 1.5rem;
margin-bottom: 1.5rem;
border-top: 1px solid var(--main-color-----);
}
body#index > main section p {
margin-bottom: 0.5rem;
font-size: 0.95rem;
line-height: 1.45rem;
}
@media (max-width: 769px) {
body#index > main section {
max-width: unset;
}
}
/* INDEX - FOOTER */
body#index > main footer {
max-width: 45ch;
margin: auto;
text-align: center;
padding-top: 1.5rem;
margin-bottom: 1.5rem;
border-top: 1px solid var(--main-color-----);
}
body#index > main footer p {
margin-bottom: 0.5rem;
/*font-size: 0.95rem;*/
}
@media (max-width: 769px) {
body#index > main footer {
max-width: unset;
}
}
body#index > main footer img {
width: 10rem;
display: inline-block;
max-width: 50%;
}

Wyświetl plik

@ -0,0 +1,45 @@
//------------------------------------------------------------------------------
// "form-submit" dialog logic
//------------------------------------------------------------------------------
/**
 * Capture button: instead of submitting right away, validate the two inputs and,
 * if both pass, show the "form-submit" confirmation dialog.
 * The first invalid field gets the browser's native validation bubble.
 */
const captureButton = document.querySelector("body#index form button");

captureButton.addEventListener("click", (event) => {
  event.preventDefault();

  const fields = [
    document.querySelector("body#index form input[name='url']"),
    document.querySelector("body#index form input[name='access-key']"),
  ];

  for (const field of fields) {
    if (!field.checkValidity()) {
      field.reportValidity();
      return;
    }
  }

  document.querySelector("dialog#form-submit").showModal();
});

/**
 * Confirmation button of the "form-submit" dialog: dismiss the dialog, then send the form.
 */
const confirmButton = document.querySelector("dialog#form-submit button");

confirmButton.addEventListener("click", (event) => {
  event.preventDefault();

  const dialog = document.querySelector("dialog#form-submit");
  dialog.close();

  const form = document.querySelector("body#index form");
  form.submit();
});

//------------------------------------------------------------------------------
// "form-error" dialog logic
//------------------------------------------------------------------------------
/**
 * The "form-error" dialog only exists when the server rendered an error:
 * show it on load, and let its button dismiss it.
 */
const formErrorDialog = document.querySelector("dialog#form-error");

if (formErrorDialog) {
  formErrorDialog.showModal();

  const dismissButton = formErrorDialog.querySelector("button");
  dismissButton.addEventListener("click", (event) => {
    formErrorDialog.close();
  });
}

40
app/static/lil.svg 100644

File diff suppressed because one or more lines are too long

Po

Szerokość:  |  Wysokość:  |  Rozmiar: 18 KiB

File diff suppressed because one or more lines are too long

Po

Szerokość:  |  Wysokość:  |  Rozmiar: 314 KiB

Wyświetl plik

@ -0,0 +1,119 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>archive.social</title>
<meta name="description" content="">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="/static/index.css">
<script type="module" src="/static/index.js"></script>
</head>
<body id="index">
<main>
<!-- HEADER -->
<img
src="/static/ocean-of-books.svg"
alt="Illustration of a person falling in an ocean of books.">
<header>
<h1>archive.social</h1>
<p>High-fidelity capture of Twitter threads as sealed PDFs.</p>
</header>
<!-- MAIN FORM -->
<form action="/" target="_blank" method="POST" enctype="application/x-www-form-urlencoded">
<fieldset>
<label for="url">Twitter.com url to capture</label>
<input type="url"
pattern="https://twitter.com/.*"
id="url"
name="url"
placeholder="https://twitter.com/..."
required>
</fieldset>
<fieldset>
<label for="access-key">Archive.social access key</label>
<input type="password"
name="access-key"
id="access-key"
required>
</fieldset>
<fieldset class="submit">
<!-- `noreferrer` (double "r") is the link type browsers recognise. -->
<a href="" target="_blank" rel="noreferrer">Request an access key.</a>
<button>Capture</button>
</fieldset>
</form>
{% if error %}
<!-- ERROR DIALOG -->
<dialog id="form-error">
<h2>Something went wrong</h2>
{% if errorReason and errorReason == "ACCESS-KEY" %}
<p>The access key provided is invalid or no longer active.</p>
{% endif %}
{% if errorReason and errorReason == "URL" %}
<p>The url provided is not a valid twitter.com url.</p>
{% endif %}
{% if errorReason and errorReason == "TOO-MANY-CAPTURES-TOTAL" %}
<p>archive.social has received too many requests. Please retry in a minute.</p>
{% endif %}
{% if errorReason and errorReason == "TOO-MANY-CAPTURES-USER" %}
<p>Please wait until the capture requests you've started are completed before starting a new one.</p>
{% endif %}
{% if errorReason and errorReason == "CAPTURE-ISSUE" %}
<p>archive.social encountered an issue during the capture process itself. Please try again later.</p>
{% endif %}
<button>Ok</button>
</dialog>
{% endif %}
<!-- FORM SUBMIT DIALOG -->
<dialog id="form-submit">
<h2>Request a capture.</h2>
<p>Submitting this form will open a new tab, in which your request will be processed.</p>
<p>The capture process should take around a minute, at the end of which the resulting sealed PDF will be ready to be downloaded.</p>
<button>I understand, proceed.</button>
</dialog>
<!-- EXPLAINER -->
<section>
<h2>How does it work?</h2>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam.</p>
<p>Quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p><a href="/static/example.pdf">See an example</a>.</p>
</section>
<!-- FOOTER -->
<footer>
<p>
<a href="https://lil.law.harvard.edu">
<img src="/static/lil.svg" alt="Harvard Library Innovation Lab - Logo">
</a>
</p>
<p>archive.social is an experiment of the <a href="https://lil.law.harvard.edu">Harvard Library Innovation Lab</a>.</p>
</footer>
</main>
</body>
</html>

Wyświetl plik

@ -0,0 +1,28 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>archive.social | Capture complete.</title>
<meta name="description" content="" />
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<style>
* {
padding: 0px;
margin: 0px;
}
iframe {
display: block;
width: 100%;
height: 100vh;
border: 0px;
}
</style>
</head>
<body>
<iframe src="data:application/pdf;base64,{{ pdfBase64 }}"></iframe>
</body>
</html>

0
app/tmp/.keep 100644
Wyświetl plik

Wyświetl plik

@ -0,0 +1,40 @@
/**
* archive.social
* @module utils.AccessKeys
* @author The Harvard Library Innovation Lab
* @license MIT
*/
import fs from "fs";
import { DATA_PATH } from "../const.js";
/**
 * Utility class for handling access keys to the app.
 * [!] Needs replacement.
 */
export class AccessKeys {
  /**
   * Complete path to `access-keys.json`.
   * @type {string}
   */
  static filepath = `${DATA_PATH}access-keys.json`;

  /**
   * Tries to load the access keys hashmap from disk.
   *
   * - If the file does not exist, an empty one (`{}`) is created.
   * - If the file exists but cannot be read or parsed, it is left untouched
   *   (so a typo in the file never wipes the existing keys) and the problem is logged.
   *
   * In both failure cases an empty frozen object is returned, so callers
   * always get an object to look keys up in.
   *
   * @returns {object} - Frozen object
   */
  static fetch() {
    const filepath = AccessKeys.filepath;
    let keys = {};

    try {
      const parsed = JSON.parse(fs.readFileSync(filepath, "utf8"));

      // Valid JSON that is not an object (`null`, a number ...) cannot be used as a hashmap.
      if (parsed !== null && typeof parsed === "object") {
        keys = parsed;
      }
    }
    catch (err) {
      if (err.code === "ENOENT") {
        fs.writeFileSync(filepath, "{}");
      }
      else {
        console.error(`Could not load access keys from ${filepath}. ${err}`);
      }
    }

    return Object.freeze(keys);
  }
}

Wyświetl plik

@ -0,0 +1,45 @@
/**
* archive.social
* @module utils.logCaptureSuccess
* @author The Harvard Library Innovation Lab
* @license MIT
*/
import fs from "fs";
import crypto from "crypto";
import { DATA_PATH } from "../const.js";
/**
 * Append-only, tab-separated log of the captures that were successfully delivered.
 */
export class SuccessLog {
  /**
   * Complete path to `success-log.tsv`.
   * @type {string}
   */
  static filepath = `${DATA_PATH}success-log.tsv`;

  /**
   * Appends one line to `success-log.tsv`: ISO date-time, access key, SHA512 hash of the PDF.
   * The file (with its header line) is created first if it is not there yet.
   *
   * @param {*} accessKey
   * @param {Buffer} pdfBytes - Used to store a SHA512 hash of the PDF that was delivered
   */
  static add(accessKey, pdfBytes) {
    const logFile = SuccessLog.filepath;

    // Missing file: start a new log with its header line
    const logFileExists = fs.existsSync(logFile);
    if (logFileExists === false) {
      SuccessLog.reset();
    }

    // SHA512 hash of the PDF, base64-encoded
    const hasher = crypto.createHash('sha512');
    hasher.update(pdfBytes);
    const pdfHash = hasher.digest('base64');

    // One tab-separated line per capture
    const timestamp = new Date().toISOString();
    fs.appendFileSync(logFile, `${timestamp}\t${accessKey}\tsha512-${pdfHash}\n`);
  }

  /**
   * (Re)creates `success-log.tsv` so it only contains its header line.
   */
  static reset() {
    const header = "date-time\taccess-key\thash\n";
    fs.writeFileSync(SuccessLog.filepath, header);
  }
}

Wyświetl plik

@ -0,0 +1,675 @@
/**
* archive.social
* @module utils.TwitterCapture
* @author The Harvard Library Innovation Lab
* @license MIT
*/
import fs from "fs";
import { spawnSync } from "child_process";
import { chromium } from "playwright";
import { v4 as uuidv4 } from "uuid";
import { PDFDocument } from "pdf-lib";
import { CERTS_PATH, TMP_PATH, EXECUTABLES_FOLDER } from "../const.js";
/**
* Generates a "sealed" PDF out of a twitter.com url using Playwright.
*
* Usage:
* ```
* const tweet = new TwitterCapture(url);
* const pdf = await tweet.capture();
* fs.writeFileSync("tweet.pdf", pdf);
* ```
*/
export class TwitterCapture {
/**
* Defaults for options that can be passed to `TwitterCapture`.
* All timeouts end up in `setTimeout` or in `timeout` options, i.e. they are in milliseconds.
*
* @property {string} privateKeyPath - Path to `.pem` file containing a private key.
* @property {string} certPath - Path to a `.pem` file containing a certificate.
* @property {string} tmpFolderPath - Path to a folder in which temporary file can be written. Used as a prefix: needs a trailing slash.
* @property {string} ytDlpPath - Path to the `yt-dlp` executable.
* @property {string} timestampServerUrl - Timestamping server.
* @property {number} networkidleTimeout - Time to wait for "networkidle" state.
* @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time.
* @property {number} videoCaptureTimeout - Maximum yt-dlp execution time.
* @property {number} renderTimeout - Time to wait for re-renders.
*/
static defaults = {
privateKeyPath: `${CERTS_PATH}key.pem`,
certPath: `${CERTS_PATH}cert.pem`,
tmpFolderPath: `${TMP_PATH}`,
ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
timestampServerUrl: "http://timestamp.digicert.com",
networkidleTimeout: 5000,
browserBehaviorsTimeout: 30000,
videoCaptureTimeout: 10000,
renderTimeout: 2500
};
/** @type {object} - Options of this instance. Based on TwitterCapture.defaults */
options = {};
/** @type {?string} - Url to capture. */
url = null;
/** @type {?string} - Kind of url being captured. `capture()` treats the value "profile" differently. */
urlType = null;
/**
* Playwright objects used for the capture, and whether they can be used.
* @type {{
* browser: ?import('playwright').Browser,
* context: ?import('playwright').BrowserContext,
* page: ?import('playwright').Page,
* ready: boolean
* }}
*/
playwright = {
browser: null,
context: null,
page: null,
ready: false
};
/** @type {object<string, Buffer>} - Bodies of the JPEG responses seen during capture, indexed by url. */
interceptedJPEGs = {};
/**
* Runs the url and the options through `filterUrl()` and `filterOptions()`.
*
* NOTE(review): `filterOptions` is declared `async` and is not awaited here.
* Presumably it completes synchronously, so `this.options` is set once the constructor returns — confirm against its body.
*
* @param {string} url - `twitter.com` url to capture. Works best on statuses and threads.
* @param {object} options - See `TwitterCapture.defaults` for detailed options. Will use defaults unless overridden.
*/
constructor(url, options = {}) {
this.filterUrl(url);
this.filterOptions(options);
}
/**
 * Captures the current Twitter.com url and makes it a signed PDF.
 *
 * Steps: browser setup, page load, UI clean-up, browser behaviors, PDF generation,
 * attachments (images, video), metadata, crop, compression, signature.
 * The browser is torn down whether the capture succeeds or fails.
 *
 * @returns {Promise<Buffer>} - Signed PDF.
 */
capture = async() => {
  let rawPDF = null;
  let editablePDF = null;
  let editedPDF = null;

  try {
    // Playwright init
    await this.setup();

    // Page load + network idle
    try {
      await this.playwright.page.goto(this.url, {
        waitUntil: "networkidle",
        timeout: this.options.networkidleTimeout,
      });
    }
    catch(err) { /* Timeout errors are non-blocking */ }

    // Adjust UI (#1)
    await this.adjustUIForCapture();

    // Run browser behaviors
    if (this.urlType !== "profile") { // Skipped on profile pages
      await this.runBrowserBehaviors();
    }
    else {
      // Profile pages: simply give the page some time. Needs to be awaited to have any effect.
      await new Promise(resolve => setTimeout(resolve, this.options.networkidleTimeout));
    }

    // Wait for network idle (`waitForLoadState` is a method of the page)
    try {
      await this.playwright.page.waitForLoadState("networkidle", {
        timeout: this.options.networkidleTimeout
      });
    }
    catch(err) { /* Timeout errors are non-blocking */ }

    // Adjust UI (#2 - Accounts for re-renders)
    await this.adjustUIForCapture();

    // Resize browser to fit document dimensions
    if (this.urlType !== "profile") { // Skipped on profile pages
      await this.resizeViewportToFitDocument();
    }

    // Generate raw PDF and open editable PDF
    rawPDF = await this.generateRawPDF();
    editablePDF = await PDFDocument.load(rawPDF);

    // Add intercepted JPEGs as attachments
    await this.addInterceptedJPEGsToPDF(editablePDF);

    // Remove extraneous page, add metadata
    try {
      editablePDF.setTitle(`Capture of ${this.url} by archive.social on ${new Date().toISOString()}`);
      editablePDF.setCreationDate(new Date());
      editablePDF.setModificationDate(new Date());
      editablePDF.setProducer("archive.social");
      editablePDF.removePage(1);
    }
    catch(err) {
      // Non-blocking (i.e: there was no second page to remove)
      console.log(err);
    }

    // Try to capture video, if any, and add it as attachment
    await this.captureAndAddVideoToPDF(editablePDF);

    // Freeze edited PDF in memory
    editedPDF = await editablePDF.save();

    // Crop and compress
    editedPDF = await this.cropMarginsOnPDF(editedPDF);
    editedPDF = await this.compressPDF(editedPDF);

    // Sign
    editedPDF = await this.signPDF(editedPDF);

    // Return buffer
    return editedPDF;
  }
  // Teardown: runs on success and on failure, so no browser is left behind.
  finally {
    try {
      await this.teardown();
    }
    catch { /* Ignore teardown errors */ }
  }
};
/**
 * Sets up the browser used for capture as well as network interception for images capture.
 * Populates `this.playwright` and sets `this.playwright.ready` to `true`.
 *
 * User agent and viewport are borrowed from Playwright's "Pixel 2 XL" device descriptor.
 * NOTE(review): the descriptor is read through `chromium._playwright`, which looks like a
 * private handle; the `devices` export of `playwright` may be a safer source.
 *
 * @returns {Promise<void>}
 */
setup = async() => {
  const { userAgent, viewport } = chromium._playwright.devices["Pixel 2 XL"];

  this.playwright.browser = await chromium.launch({
    headless: true,
    channel: "chrome",
  });

  this.playwright.context = await this.playwright.browser.newContext({ userAgent });
  this.playwright.page = await this.playwright.context.newPage();

  // Awaited so the viewport is applied (and failures surface) before the page is used.
  await this.playwright.page.setViewportSize(viewport);

  this.playwright.page.on("response", this.interceptJpegs);

  this.playwright.ready = true;
};
/**
 * Closes the page, context and browser used for capture, and marks Playwright as not ready.
 *
 * Safe to call on a partially initialised instance (i.e: `setup()` failed midway):
 * missing objects are skipped, and the browser gets closed even if closing the page
 * or the context throws.
 *
 * @returns {Promise<void>}
 */
teardown = async() => {
  // From here on the browser can no longer be relied upon.
  this.playwright.ready = false;

  try {
    await this.playwright.page?.close();
    await this.playwright.context?.close();
  }
  finally {
    await this.playwright.browser?.close();
  }
};
/**
 * Cleans up the DOM of the current page ahead of the capture: hides the navigation bar,
 * header, bottom bar (and two levels of its ancestors) and full-screen dialog,
 * then clicks on the "Not now" label of the log in bar, if there is one.
 *
 * Playwright needs to be ready.
 *
 * @returns {Promise<void>}
 */
adjustUIForCapture = async() => {
  const { ready, page } = this.playwright;

  if (ready !== true) {
    throw new Error("Playwright is not ready.");
  }

  // Everything in this function runs in the page, not in Node.
  await page.evaluate(async() => {
    const hide = (node) => node?.setAttribute("style", "display: none;");

    // Nav bar and header
    hide(document.querySelector("div[data-testid='TopNavBar']"));
    hide(document.querySelector("header"));

    // Bottom bar, its parent and its grandparent
    const bottomBar = document.querySelector("div[data-testid='BottomBar']");
    hide(bottomBar);
    hide(bottomBar?.parentNode);
    hide(bottomBar?.parentNode?.parentNode);

    // Full-screen dialog
    hide(document.querySelector("div[role='dialog']"));

    // "Log in" bar
    const notNow = document.evaluate(
      "//span[text()='Not now']",
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );
    notNow?.singleNodeValue?.click();
  });
};
/**
 * Runs browser behaviors, for at most `options.browserBehaviorsTimeout`:
 * - Scrolls down the page, 100px at a time.
 * - Clicks on the next available "Show replies" button after each scroll step.
 *
 * Errors thrown by the behaviors are ignored.
 * Playwright needs to be ready.
 *
 * @returns {Promise<void>}
 */
runBrowserBehaviors = async() => {
  const { ready, page } = this.playwright;

  if (ready !== true) {
    throw new Error("Playwright is not ready.");
  }

  // Behaviors script. Runs in the page, not in Node.
  const scrollAndExpand = async () => {
    const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
    let offset = document.documentElement.scrollTop;

    do {
      // Auto scroll: +100px every 250ms
      offset += 100;
      window.scrollTo({top: offset});
      await pause(250);

      // Auto click on first available "Show replies" button
      const showReplies = document.evaluate(
        "//span[text()='Show replies']",
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      )?.singleNodeValue;

      if (showReplies) {
        showReplies.click();
        await pause(1500);
      }
    }
    // Stop when reaching bottom of page
    while (offset < Math.max(document.body.scrollHeight, window.outerHeight));
  };

  try {
    // Max execution time for the browser behaviors
    const deadline = new Promise((resolve) => {
      setTimeout(resolve, this.options.browserBehaviorsTimeout);
    });

    const behaviors = page.evaluate(scrollAndExpand);

    await Promise.race([deadline, behaviors]);
  }
  catch(err) {
    // Ignore behavior errors.
  }
};
/**
 * Resizes the viewport so it is as large as the document, then waits
 * `options.renderTimeout` for the page to re-render.
 *
 * @returns {Promise<void>}
 */
resizeViewportToFitDocument = async() => {
  const { width, height } = await this.getDocumentDimensions();

  await this.playwright.page.setViewportSize({ width, height });

  await new Promise((resolve) => {
    setTimeout(resolve, this.options.renderTimeout);
  });
};
/**
 * Measures the document in the page: for each axis, the largest of the body's
 * scroll size and the window's outer size.
 *
 * Playwright needs to be ready.
 *
 * @returns {Promise<{width: number, height: number}>}
 */
getDocumentDimensions = async() => {
  const { ready, page } = this.playwright;

  if (ready !== true) {
    throw new Error("Playwright is not ready.");
  }

  // Runs in the page, not in Node.
  return await page.evaluate(() => ({
    width: Math.max(document.body.scrollWidth, window.outerWidth),
    height: Math.max(document.body.scrollHeight, window.outerHeight)
  }));
};
/**
 * Response handler: stores the body of every response whose `content-type` header
 * is exactly `image/jpeg` in `this.interceptedJPEGs`, indexed by url.
 * Responses that cannot be read are skipped.
 *
 * @param {import('playwright').Response} response
 * @returns {Promise<void>}
 */
interceptJpegs = async(response) => {
  try {
    const { "content-type": contentType } = await response.allHeaders();

    if (contentType !== "image/jpeg") {
      return;
    }

    const body = await response.body();
    const source = await response.url();
    this.interceptedJPEGs[source] = body;
  }
  catch (err) {
    // Some exchanges can't be captured, and that's okay :).
  }
};
/**
 * Prints the current page to PDF: scrolls back to the top, waits `options.renderTimeout`,
 * switches to "screen" media and generates a single PDF sized after the document.
 *
 * Playwright needs to be ready.
 *
 * @returns {Promise<Buffer>} - PDF Bytes
 */
generateRawPDF = async() => {
  const { ready, page } = this.playwright;

  if (ready !== true) {
    throw new Error("Playwright is not ready.");
  }

  // Back to the top of the page, then let it settle
  await page.evaluate(() => window.scrollTo({top: 0}));
  await new Promise((resolve) => {
    setTimeout(resolve, this.options.renderTimeout);
  });

  // Print, using the document's own dimensions as page size
  await page.emulateMedia({media: 'screen'});
  const { width, height } = await this.getDocumentDimensions();

  return await page.pdf({ printBackground: true, width, height });
};
/**
 * Attaches every entry of `this.interceptedJPEGs` to the PDF.
 * Attachment names derive from the path and query string of the image url:
 * slashes become dashes, a leading dash is dropped, and `.jpg` is appended when missing.
 *
 * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
 * @returns {Promise<void>}
 */
addInterceptedJPEGsToPDF = async(editablePDF) => {
  for (const [source, bytes] of Object.entries(this.interceptedJPEGs)) {
    const { pathname, search } = new URL(source);

    let attachmentName = `${pathname}${search}`.split("/").join("-");
    attachmentName = attachmentName.endsWith(".jpg") ? attachmentName : `${attachmentName}.jpg`;
    attachmentName = attachmentName.startsWith("-") ? attachmentName.slice(1) : attachmentName;

    const details = {
      mimeType: 'image/jpeg',
      description: `Image captured from ${this.url}`,
      creationDate: new Date(),
      modificationDate: new Date(),
    };

    await editablePDF.attach(bytes, attachmentName, details);
  }
};
/**
 * Tries to capture main video from current Twitter url and add it as attachment to the PDF.
 *
 * The capture itself is best-effort: if `yt-dlp` finds no video, fails or times out,
 * the PDF is simply left without video attachment. The temporary `.mp4` file is
 * removed in every case.
 *
 * @param {PDFDocument} editablePDF - Editable PDF object from `pdf-lib`.
 * @returns {Promise<void>}
 * @throws {Error} - If the `yt-dlp` executable is missing or does not report a valid version.
 */
captureAndAddVideoToPDF = async(editablePDF) => {
  const id = uuidv4();
  const filepathOut = `${this.options.tmpFolderPath}${id}.mp4`;
  const ytDlpExecutable = this.options.ytDlpPath;

  // yt-dlp health check
  try {
    const result = spawnSync(ytDlpExecutable, ["--version"], {encoding: "utf8"});

    if (result.status !== 0) {
      // `result.error` is set when the process could not be spawned at all (stderr is then empty).
      throw result.error ?? new Error(result.stderr);
    }

    const version = result.stdout.trim();

    if (!version.match(/^[0-9]{4}\.[0-9]{2}\.[0-9]{2}$/)) {
      throw new Error(`Unknown version: ${version}`);
    }
  }
  catch(err) {
    throw new Error(
      `"yt-dlp" executable is not available or cannot be executed. ${err}`,
      { cause: err }
    );
  }

  // Capture
  try {
    const dlpOptions = [
      "--no-warnings", // Prevents pollution of stdout
      "--no-progress", // (Same as above)
      "--format", "mp4", // Forces .mp4 format
      "--output", filepathOut,
      this.url
    ];

    const spawnOptions = {
      timeout: this.options.videoCaptureTimeout,
      encoding: "utf8",
    };

    const result = spawnSync(ytDlpExecutable, dlpOptions, spawnOptions);

    if (result.status !== 0) {
      throw result.error ?? new Error(result.stderr);
    }

    const video = fs.readFileSync(filepathOut);

    await editablePDF.attach(video, "video.mp4", {
      mimeType: 'video/mp4',
      description: `Video captured from ${this.url}`,
      creationDate: new Date(),
      modificationDate: new Date(),
    });
  }
  catch(err) {
    // Deliberately ignored: most urls have no video to capture.
  }
  finally {
    // Clean up whatever was written, including after a failure. Missing file is not an issue.
    fs.unlink(filepathOut, () => {});
  }
};
/**
 * Crops the margins of a PDF using the `pdf-crop-margins` command.
 * Goes through two temporary files, which are removed whether cropping succeeds or fails.
 *
 * @param {Buffer} editedPDF - PDF Bytes
 * @returns {Promise<Buffer>} - PDF Bytes
 * @throws {Error} - If `pdf-crop-margins` cannot be run or exits with a non-zero status.
 */
cropMarginsOnPDF = async(editedPDF) => {
  const id = uuidv4();
  const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
  const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;

  try {
    // Save PDF to disk
    fs.writeFileSync(filepathIn, editedPDF);

    // Apply cropping
    const run = spawnSync(
      "pdf-crop-margins",
      ["-p", "0", "-a", "-20", "-o", filepathOut, filepathIn],
      { encoding: "utf-8" }
    );

    if (run.status !== 0) {
      // `run.error` is set when the process could not be spawned at all (stderr is then empty).
      throw run.error ?? new Error(run.stderr);
    }

    // Load cropped file from disk and return
    return fs.readFileSync(filepathOut);
  }
  finally {
    // Missing files are not an issue.
    fs.unlink(filepathIn, () => {});
    fs.unlink(filepathOut, () => {});
  }
};
/**
 * Compresses a PDF by re-writing it with Ghostscript (`gs`, `pdfwrite` device).
 *
 * The PDF is exchanged with the tool via temporary files written under `this.options.tmpFolderPath`.
 * These files are removed once processing is done, whether it succeeded or not.
 *
 * @param {Buffer} editedPDF - PDF Bytes
 * @returns {Promise<Buffer>} - PDF Bytes
 * @throws {Error} If `gs` could not be run, or exited with a non-zero status.
 */
compressPDF = async(editedPDF) => {
  const id = uuidv4();
  const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
  const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;

  try {
    // Save PDF to disk
    fs.writeFileSync(filepathIn, editedPDF);

    const run = spawnSync(
      "gs",
      [
        "-sDEVICE=pdfwrite",
        "-dNOPAUSE",
        "-dBATCH",
        "-dJPEGQ=90",
        "-r150",
        `-sOutputFile=${filepathOut}`,
        `${filepathIn}`,
      ],
      { encoding: "utf-8" } // So `stderr` is a string, as in the other PDF processing steps.
    );

    if (run.status !== 0) {
      // `stderr` is empty when the process could not be spawned: fall back on the spawn error.
      throw new Error(run.stderr || run.error?.message || `gs exited with status ${run.status}`);
    }

    // Load compressed file from disk and return
    return fs.readFileSync(filepathOut);
  }
  finally {
    // Always remove temporary files, even on failure. Errors are ignored: files may not exist.
    fs.unlink(filepathIn, () => {});
    fs.unlink(filepathOut, () => {});
  }
}
/**
 * Signs and timestamps a PDF using the `pyhanko` command-line tool.
 *
 * Uses the key pair found at `this.options.privateKeyPath` / `this.options.certPath` (no passphrase)
 * and the timestamping server at `this.options.timestampServerUrl`. The signature field is named `Sig1`.
 *
 * The PDF is exchanged with the tool via temporary files written under `this.options.tmpFolderPath`.
 * These files are removed once processing is done, whether it succeeded or not.
 *
 * @param {Buffer} editedPDF - PDF Bytes
 * @returns {Promise<Buffer>} - PDF Bytes
 * @throws {Error} If `pyhanko` could not be run, or exited with a non-zero status.
 */
signPDF = async(editedPDF) => {
  const id = uuidv4();
  const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
  const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;

  try {
    // Save PDF to disk
    fs.writeFileSync(filepathIn, editedPDF);

    const run = spawnSync(
      "pyhanko",
      [
        "sign",
        "addsig",
        "--field", "Sig1",
        "--timestamp-url", this.options.timestampServerUrl,
        "pemder",
        "--key", this.options.privateKeyPath,
        "--cert", this.options.certPath,
        "--no-pass",
        filepathIn,
        filepathOut
      ],
      { encoding: "utf-8" }
    );

    if (run.status !== 0) {
      // `stderr` is empty when the process could not be spawned: fall back on the spawn error.
      throw new Error(run.stderr || run.error?.message || `pyhanko exited with status ${run.status}`);
    }

    // Load signed file from disk and return
    return fs.readFileSync(filepathOut);
  }
  finally {
    // Always remove temporary files, even on failure. Errors are ignored: files may not exist.
    fs.unlink(filepathIn, () => {});
    fs.unlink(filepathOut, () => {});
  }
}
/**
 * Applies some basic filtering to new option objects and fills gaps with defaults.
 * Replaces `this.options` after filtering.
 *
 * - Only keys present in `TwitterCapture.defaults` are kept.
 * - Values are coerced to the type of their default (boolean, number or string).
 * - Missing or `undefined` values, as well as numbers that cannot be parsed, fall back to the default.
 * - Anything that is not an object is treated as "no options provided".
 *
 * @param {object} newOptions
 * @returns {Promise<void>}
 */
filterOptions = async(newOptions) => {
  const defaults = TwitterCapture.defaults;
  const options = {};

  // The `in` operator throws on `null`, `undefined` and primitives: guard against it.
  const provided = newOptions !== null && typeof newOptions === "object" ? newOptions : {};

  for (const key of Object.keys(defaults)) {
    const fallback = defaults[key];
    let value = key in provided && provided[key] !== undefined ? provided[key] : fallback;

    switch (typeof fallback) {
      case "boolean":
        value = Boolean(value);
        break;

      case "number":
        value = Number(value);

        // Unparsable input (i.e: "abc") must not leak `NaN` into the options.
        if (Number.isNaN(value)) {
          value = fallback;
        }
        break;

      case "string":
        value = String(value);
        break;
    }

    options[key] = value;
  }

  this.options = options;
}
/**
 * Filters a given URL to ensure it's a `twitter.com` one.
 * Also determines its "type": "status", "search" or "profile".
 *
 * Known tracking params (`s`, `t`, `ctx`) are removed from the url.
 * Automatically populates `this.url` and `this.urlType`.
 *
 * @param {string} url
 * @returns {boolean} - Always `true`: invalid urls are reported by throwing.
 * @throws {Error} If `url` cannot be parsed or its origin is not `https://twitter.com`.
 */
filterUrl = (url) => {
  /** @type {?URL} */
  let parsedUrl = null;

  /** @type {?string} */
  let urlType = null;

  //
  // Determine if `url` is a valid `twitter.com` and remove known tracking params
  //
  try {
    parsedUrl = new URL(url); // Will throw if not a valid url.
  }
  catch (err) {
    throw new Error(`${url} is not a valid Twitter url.`);
  }

  if (parsedUrl.origin !== "https://twitter.com") {
    throw new Error(`${url} is not a valid Twitter url.`);
  }

  for (const trackingParam of ["s", "t", "ctx"]) {
    parsedUrl.searchParams.delete(trackingParam);
  }

  //
  // Determine Twitter url "type"
  //
  const pathname = parsedUrl.pathname;

  if (pathname.includes("/status/")) {
    urlType = "status";
  }
  // Matches the `/search` path only: a profile whose handle merely starts
  // with "search" (i.e: `/searchliaison`) must not be treated as a search page.
  else if (pathname === "/search" || pathname.startsWith("/search/")) {
    urlType = "search";
  }
  else {
    urlType = "profile";
  }

  this.url = parsedUrl.href;
  this.urlType = urlType;

  return true;
}
}

11
app/utils/index.js 100644
Wyświetl plik

@ -0,0 +1,11 @@
/**
* archive.social
* @module utils
* @author The Harvard Library Innovation Lab
* @license MIT
*/
import { AccessKeys } from "./AccessKeys.js";
import { TwitterCapture } from "./TwitterCapture.js";
import { SuccessLog } from "./SuccessLog.js";
export { AccessKeys, SuccessLog, TwitterCapture };

5
brewfile 100644
Wyświetl plik

@ -0,0 +1,5 @@
# For use on Mac OS for development purposes only.
# Lists machine-level dependencies to be installed via `brew bundle`.
brew "curl" # Used by the yt-dlp download script.
brew "openssl" # Used to generate the local certificate for signing PDFs.
brew "ghostscript" # Provides `gs`, used to compress PDFs.
brew "poppler" # NOTE(review): usage is not visible from the capture code — confirm what relies on it.

0
certs/.keep 100644
Wyświetl plik

Wyświetl plik

2810
package-lock.json wygenerowano 100644

Plik diff jest za duży Load Diff

88
package.json 100644
Wyświetl plik

@ -0,0 +1,88 @@
{
"name": "archive.social",
"version": "0.0.1",
"description": "",
"main": "app.js",
"type": "module",
"scripts": {
"start": "fastify start app/server.js",
"dev": "fastify start app/server.js -l info -w",
"postinstall": "cd scripts && bash download-yt-dlp.sh && bash pip-install.sh",
"generate-local-cert": "cd scripts && bash generate-local-cert.sh",
"test": "echo \"Error: no test specified\" && exit 1"
},
"dependencies": {
"@fastify/formbody": "^7.3.0",
"@fastify/static": "^6.5.0",
"abort-controller": "^3.0.0",
"abstract-logging": "^2.0.1",
"ajv": "^8.11.2",
"ajv-formats": "^2.1.1",
"archy": "^1.0.0",
"atomic-sleep": "^1.0.0",
"avvio": "^8.2.0",
"base64-js": "^1.5.1",
"buffer": "^6.0.3",
"cookie": "^0.5.0",
"debug": "^4.3.4",
"event-target-shim": "^5.0.1",
"events": "^3.3.0",
"fast-decode-uri-component": "^1.0.1",
"fast-deep-equal": "^3.1.3",
"fast-json-stringify": "^5.4.1",
"fast-querystring": "^1.0.0",
"fast-redact": "^3.1.2",
"fast-uri": "^2.1.0",
"fastify": "^4.9.2",
"fastify-cli": "^5.6.0",
"fastq": "^1.13.0",
"find-my-way": "^7.3.1",
"forwarded": "^0.2.0",
"ieee754": "^1.2.1",
"ipaddr.js": "^1.9.1",
"json-schema-traverse": "^1.0.0",
"light-my-request": "^5.6.1",
"lru-cache": "^6.0.0",
"ms": "^2.1.2",
"nunjucks": "^3.2.3",
"on-exit-leak-free": "^2.1.0",
"pdf-lib": "^1.17.1",
"pino": "^8.7.0",
"pino-abstract-transport": "^1.0.0",
"pino-std-serializers": "^6.0.0",
"playwright": "^1.27.1",
"process": "^0.11.10",
"process-warning": "^2.0.0",
"proxy-addr": "^2.0.7",
"punycode": "^2.1.1",
"quick-format-unescaped": "^4.0.4",
"readable-stream": "^4.2.0",
"real-require": "^0.2.0",
"require-from-string": "^2.0.2",
"ret": "^0.2.2",
"reusify": "^1.0.4",
"rfdc": "^1.3.0",
"safe-regex2": "^2.0.0",
"safe-stable-stringify": "^2.4.1",
"secure-json-parse": "^2.5.0",
"semver": "^7.3.8",
"set-cookie-parser": "^2.5.1",
"sonic-boom": "^3.2.0",
"split2": "^4.1.0",
"thread-stream": "^2.2.0",
"tiny-lru": "^9.0.3",
"uri-js": "^4.4.1",
"uuid": "^9.0.0",
"yallist": "^4.0.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/harvard-lil/archive.social.git"
},
"author": "",
"license": "ISC",
"bugs": {
"url": "https://github.com/harvard-lil/archive.social/issues"
},
"homepage": "https://github.com/harvard-lil/archive.social#readme"
}

Wyświetl plik

@ -0,0 +1,3 @@
# Pulls yt-dlp (2022.10.04 version) and saves it in `executables`.
# `--fail` makes curl exit with an error on HTTP failures (i.e: 404) instead of saving the error page
# as the executable. `chmod` only runs if the download succeeded, so the failure is not masked.
curl --fail -L https://github.com/yt-dlp/yt-dlp/releases/download/2022.10.04/yt-dlp --output ../executables/yt-dlp &&
chmod a+x ../executables/yt-dlp;

Wyświetl plik

@ -0,0 +1,3 @@
# Generates a local key pair that can be used for signing PDFs.
# Will be saved under ../certs.
# The certificate is self-signed (`-x509`), uses a 4096-bit RSA key and is valid for 3650 days.
# `-nodes` leaves the private key unencrypted (no passphrase).
openssl req -x509 -newkey rsa:4096 -keyout ../certs/key.pem -out ../certs/cert.pem -days 3650 -nodes -subj /CN="archive.social";

Wyświetl plik

@ -0,0 +1,2 @@
# Installs the Python command-line tools the app shells out to, at pinned versions.
# pyHanko: provides the `pyhanko` command, used for signing PDFs.
pip3 install "pyHanko[pkcs11,image-support,opentype,xmp]"==0.15.1;
# pdfCropMargins: provides the `pdf-crop-margins` command, used for cropping PDF margins.
pip3 install pdfCropMargins==1.0.9;