Clean up and documentation

pull/3/head
Matteo Cargnelutti 2022-11-21 12:30:34 -05:00
rodzic 3a4e51cca9
commit fa7651271f
17 zmienionych plików z 2367 dodań i 182 usunięć

Wyświetl plik

@ -1,11 +1,18 @@
# archive.social
> 🚧 Work In Progress
# archive.social 📚
High-fidelity capture of Twitter threads as sealed PDFs - [archive.social](https://archive.social).
[Archive.social](https://archive.social) is an experiment of the [Harvard Library Innovation Lab](https://lil.law.harvard.edu).
> 🚧 Experimental / Prototype. Early release to be consolidated.
---
## Summary
- [Dependencies](#dependencies)
- [Local development](#local-development)
- [Dev CLI](#dev-cli)
- [Code docs](/docs)
---
@ -13,7 +20,7 @@
### Runtimes
- [Node.js](https://nodejs.org/) 18+
- [Python](https://www.python.org/) 3.9+.
- [Python](https://www.python.org/) 3.9+
### Browsers
- Google Chrome _(`npx playwright install --force chrome` may be used)_.
@ -21,12 +28,13 @@
### Python dependencies
- ⚠️ For now: Python dependencies are installed at machine level, as a post-install step of `npm install`.
### Known Debian / Ubuntu packages
### Known Ubuntu packages
```
curl bash gcc g++ python3 python3-pip python3-dev zlib1g zlib1g-dev libjpeg-dev libssl-dev libffi-dev ghostscript poppler-utils
```
Node may be sourced from [Nodesource](https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions).
- ⚠️ This project is only compatible with Ubuntu at this time, because it uses Playwright + Chrome.
- Node may be sourced from [Nodesource](https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions).
### For development on Mac OS
A `brewfile` is available. Run `brew bundle` to install machine-level dependencies that can be provided by [homebrew](https://brew.sh/).
@ -37,12 +45,44 @@ A `brewfile` is available. Run `brew bundle` to install machine-level dependenci
## Local development
> 🚧 WIP
Run the following commands to initialize the project and start the development server.
```bash
brew bundle # (Mac OS only) - See Linux dependencies above.
npm install
npx playwright install chrome
npm run generate-local-cert # Will generate a certificate for self-signing PDFs
npm install # To install npm packages
npx playwright install chrome # To ensure Playwright has a version of Chrome to talk to
npm run generate-dev-cert # Will generate a certificate for self-signing PDFs. For testing purposes only.
npm run dev # Starts the development server on port 3000
```
---
## Dev CLI
### start
```bash
npm run start
```
Starts the app's server on port 3000 with warning-level logs.
### dev
```bash
npm run dev
```
Starts the app's server on port 3000 with info-level logs. Watches for file changes.
### generate-dev-cert
```bash
npm run generate-dev-cert
```
Generates `certs/cert.pem` and `certs/key.pem` for local development purposes.
### docgen
```bash
npm run docgen
```
Generates JSDoc-based code documentation under `/docs`.

Wyświetl plik

@ -13,6 +13,7 @@ export const CERTS_PATH = process.env.CERTS_PATH ? process.env.CERTS_PATH : `${p
/**
* Path to the "data" folder.
* @constant
*/
export const DATA_PATH = process.env.DATA_PATH ? process.env.DATA_PATH : `${process.env.PWD}/app/data/`;
@ -24,25 +25,30 @@ export const TMP_PATH = `${process.env.PWD}/app/tmp/`;
/**
* Path to the "templates" folder.
* @constant
*/
export const TEMPLATES_PATH = `${process.env.PWD}/app/templates/`;
/**
* Path to the "executables" folder.
* Path to the "executables" folder, for dependencies that are meant to be executed directly, such as `yt-dlp`.
* @constant
*/
export const EXECUTABLES_FOLDER = `${process.env.PWD}/executables/`;
export const EXECUTABLES_FOLDER = `${process.env.PWD}/executables/`;
/**
 * Path to the "static" folder.
 * Built from the `PWD` environment variable and always ends with a trailing slash.
 * The server registers this folder as the root of `@fastify/static`, under the `/static/` prefix.
 * @constant
 */
export const STATIC_PATH = `${process.env.PWD}/app/static/`;
/**
 * Maximum capture processes that can be run in parallel, all access keys combined.
 * NOTE(review): presumably exposed to the server as `CAPTURES_WATCH.maxTotal`, in which case
 * the capture route answers HTTP 503 once this many captures are running — confirm in the server module.
 * @constant
 */
export const MAX_PARALLEL_CAPTURES_TOTAL = 200;
/**
 * Maximum capture processes that can be run in parallel for a given key.
 * Exposed to the server as `CAPTURES_WATCH.maxPerAccessKey`: the capture route answers HTTP 429
 * once a single access key has this many captures running.
 * @constant
 */
export const MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY = 20;

Wyświetl plik

@ -27,10 +27,10 @@ const successLog = new SuccessLog();
const accessKeys = new AccessKeys();
/**
* Keeps track of how many capture processes are currently running.
* May be used to redirect users if over capacity.
* Keeps track of how many capture processes are currently running.
* May be used to redirect users if over capacity.
*
* [!] This needs to be upgraded to proper rate limiting after launch.
* [!] Only good for early prototyping.
*
* @type {{
* currentTotal: number,
@ -46,170 +46,193 @@ const CAPTURES_WATCH = {
maxPerAccessKey: MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY,
}
export default async function (fastify, opts) {
/**
 * [GET] `/`
 * Landing page handler: renders the `index.njk` template (no template context)
 * and serves the result as UTF-8 HTML with HTTP 200.
 * Relies on the module-level `nunjucks` and `TEMPLATES_PATH`.
 *
 * @param {fastify.FastifyRequest} request - Not read by this handler.
 * @param {fastify.FastifyReply} reply
 * @returns {Promise<fastify.FastifyReply>}
 */
async function index(request, reply) {
  const templatePath = `${TEMPLATES_PATH}index.njk`;
  const landingPage = nunjucks.render(templatePath);

  const response = reply
    .code(200)
    .header('Content-Type', 'text/html; charset=utf-8');

  return response.send(landingPage);
}
/**
 * [POST] `/`
 * Processes a request to capture a `twitter.com` url.
 * Serves PDF bytes directly (HTTP 200, `capture.pdf` attachment) if the operation is successful.
 * Otherwise re-renders the form (`index.njk`) with `error: true` and one of these `errorReason` values:
 * - `ACCESS-KEY` (HTTP 401): `accessKeys.check()` rejected the `access-key` field.
 * - `URL` (HTTP 400): `url` cannot be parsed, or its origin is not `https://twitter.com`.
 * - `TOO-MANY-CAPTURES-TOTAL` (HTTP 503): `CAPTURES_WATCH.currentTotal` reached `maxTotal`.
 * - `TOO-MANY-CAPTURES-USER` (HTTP 429): this access key reached `maxPerAccessKey`.
 * - `CAPTURE-ISSUE` (HTTP 500): the capture, or logging its success, threw.
 *
 * Relies on the module-level `accessKeys`, `successLog`, `CAPTURES_WATCH`, `nunjucks`,
 * `TEMPLATES_PATH` and `TwitterCapture`.
 *
 * @param {fastify.FastifyRequest} request - `request.body` is read for the form fields `access-key`, `url` and (optional) `auto-scroll`.
 * @param {fastify.FastifyReply} reply
 * @returns {Promise<fastify.FastifyReply>}
 */
async function capture(request, reply) {
  // A request without a parsed body is handled as an empty form, so that it goes through
  // the regular checks below instead of failing on property access.
  // NOTE(review): assumes `accessKeys.check(undefined)` returns a falsy value — confirm.
  const data = request.body ?? {};
  const accessKey = data["access-key"];

  // Re-renders the form with the given error reason and HTTP status.
  const replyWithFormError = (statusCode, errorReason) => {
    const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
      error: true,
      errorReason,
    });

    return reply
      .code(statusCode)
      .header('Content-Type', 'text/html; charset=utf-8')
      .send(html);
  };

  // Number of captures currently running for `key`.
  // Own-property lookup only: names inherited from `Object.prototype` (e.g. `constructor`)
  // must not be mistaken for an existing counter.
  const runningFor = (key) => {
    const counters = CAPTURES_WATCH.currentByAccessKey;
    return Object.hasOwn(counters, key) ? counters[key] : 0;
  };

  request.log.info(`Capture capacity: ${CAPTURES_WATCH.currentTotal} / ${CAPTURES_WATCH.maxTotal}.`);

  //
  // Check access key
  //
  if (!accessKeys.check(accessKey)) {
    return replyWithFormError(401, "ACCESS-KEY");
  }

  //
  // Check url
  //
  let isTwitterUrl = false;

  try {
    isTwitterUrl = new URL(data.url).origin === "https://twitter.com";
  }
  catch {
    // `new URL()` throws on unparseable input: `isTwitterUrl` stays `false`.
  }

  if (!isTwitterUrl) {
    return replyWithFormError(400, "URL");
  }

  //
  // Check that there is still capture capacity (total)
  //
  if (CAPTURES_WATCH.currentTotal >= CAPTURES_WATCH.maxTotal) {
    return replyWithFormError(503, "TOO-MANY-CAPTURES-TOTAL");
  }

  //
  // Check that there is still capture capacity (for this access key)
  //
  if (runningFor(accessKey) >= CAPTURES_WATCH.maxPerAccessKey) {
    return replyWithFormError(429, "TOO-MANY-CAPTURES-USER");
  }

  //
  // Process capture request
  //
  try {
    CAPTURES_WATCH.currentTotal += 1;
    CAPTURES_WATCH.currentByAccessKey[accessKey] = runningFor(accessKey) + 1;

    const tweets = new TwitterCapture(data.url, {runBrowserBehaviors: "auto-scroll" in data});
    const pdf = await tweets.capture();

    successLog.add(accessKey, pdf);

    return reply
      .code(200)
      .header('Content-Type', 'application/pdf')
      .header('Content-Disposition', 'attachment; filename="capture.pdf"')
      .send(pdf);
  }
  catch(err) {
    request.log.error(`Capture failed. ${err}`);
    return replyWithFormError(500, "CAPTURE-ISSUE");
  }
  // In any case: we need to decrease CAPTURES_WATCH counts.
  finally {
    CAPTURES_WATCH.currentTotal -= 1;

    if (Object.hasOwn(CAPTURES_WATCH.currentByAccessKey, accessKey)) {
      CAPTURES_WATCH.currentByAccessKey[accessKey] -= 1;
    }
  }
}
/**
 * [GET] `/api/v1/hashes/check/<sha512-hash>`.
 * Tells whether a given SHA512 hash appears in the "success" logs, i.e. whether this app produced it.
 * The hash is the last path parameter (`request.params.hash`), url encoded.
 *
 * Only hashes that are exactly 88 or 95 characters long are looked up in the logs;
 * any other length is answered with HTTP 404 right away.
 * NOTE(review): 88 / 95 presumably match a base64 SHA512 digest without / with a `sha512-` prefix — confirm.
 *
 * Responds with an empty body: HTTP 200 if found, HTTP 404 if not.
 * Relies on the module-level `successLog`.
 *
 * @param {fastify.FastifyRequest} request
 * @param {fastify.FastifyReply} reply
 * @returns {Promise<fastify.FastifyReply>}
 */
async function checkHash(request, reply) {
  const candidate = request.params.hash;
  const hasExpectedLength = [95, 88].includes(candidate.length);

  // Short-circuits: the logs are only searched for plausible hashes.
  const isKnown = hasExpectedLength && successLog.findHashInLogs(candidate);

  return reply.code(isKnown ? 200 : 404).send();
}
export default async function (fastify, opts) {
// Adds support for `application/x-www-form-urlencoded`
fastify.register(import('@fastify/formbody'));
// Serves files from STATIC_PATH
// Serves files from `STATIC_PATH`
fastify.register(import('@fastify/static'), {
root: STATIC_PATH,
prefix: '/static/',
});
/**
* [GET] /
* Shows the landing page / form.
*/
fastify.get('/', async (request, reply) => {
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`);
// [GET] /
fastify.get('/', index);
return reply
.code(200)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
});
// [POST] /
fastify.post('/', capture);
/**
* [POST] /
* Processes a request to capture a twitter url.
* Serves PDF bytes directly if operation is successful.
* Returns to form with specific error code, passed as `errorReason`, otherwise.
*/
fastify.post('/', async (request, reply) => {
const data = request.body;
const accessKey = data["access-key"];
request.log.info(`Capture capacity: ${CAPTURES_WATCH.currentTotal} / ${CAPTURES_WATCH.maxTotal}.`);
//
// Check access key
//
if (!accessKeys.check(accessKey)) {
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
error: true,
errorReason: "ACCESS-KEY"
});
return reply
.code(401)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
}
//
// Check url
//
try {
const url = new URL(data.url);
assert(url.origin === "https://twitter.com");
}
catch(err) {
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
error: true,
errorReason: "URL"
});
return reply
.code(400)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
}
//
// Check that there is still capture capacity (total)
//
if (CAPTURES_WATCH.currentTotal >= CAPTURES_WATCH.maxTotal) {
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
error: true,
errorReason: "TOO-MANY-CAPTURES-TOTAL"
});
return reply
.code(503)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
}
//
// Check that there is still capture capacity (for this access key)
//
if (CAPTURES_WATCH.currentByAccessKey[accessKey] >= CAPTURES_WATCH.maxPerAccessKey) {
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
error: true,
errorReason: "TOO-MANY-CAPTURES-USER"
});
return reply
.code(429)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
}
//
// Process capture request
//
try {
CAPTURES_WATCH.currentTotal += 1;
if (accessKey in CAPTURES_WATCH.currentByAccessKey) {
CAPTURES_WATCH.currentByAccessKey[accessKey] += 1;
}
else {
CAPTURES_WATCH.currentByAccessKey[accessKey] = 1;
}
const tweets = new TwitterCapture(data.url, {runBrowserBehaviors: "auto-scroll" in data});
const pdf = await tweets.capture();
successLog.add(accessKey, pdf);
return reply
.code(200)
.header('Content-Type', 'application/pdf')
.header('Content-Disposition', 'attachment; filename="capture.pdf"')
.send(pdf);
}
catch(err) {
request.log.error(`Capture failed. ${err}`);
const html = nunjucks.render(`${TEMPLATES_PATH}index.njk`, {
error: true,
errorReason: "CAPTURE-ISSUE"
});
return reply
.code(500)
.header('Content-Type', 'text/html; charset=utf-8')
.send(html);
}
// In any case: we need to decrease CAPTURES_WATCH counts.
finally {
CAPTURES_WATCH.currentTotal -= 1;
if (accessKey && accessKey in CAPTURES_WATCH.currentByAccessKey) {
CAPTURES_WATCH.currentByAccessKey[data["access-key"]] -= 1;
}
}
});
/**
* [GET] /api/v1/hashes/check/<sha512-hash>
* Checks if a given SHA512 hash is in the "success" logs, meaning this app created it.
* Hash is passed as the last parameter, url encoded.
*
* Returns HTTP 200 if found, HTTP 404 if not.
*/
fastify.get('/api/v1/hashes/check/:hash', async (request, reply) => {
let found = false;
const { hash } = request.params;
if (hash.length === 95 || hash.length === 88) {
found = successLog.findHashInLogs(hash);
}
return reply.code(found ? 200 : 404).send();
});
// [GET] /api/v1/hashes/check/:hash
fastify.get('/api/v1/hashes/check/:hash', checkHash);
};

Wyświetl plik

@ -13,7 +13,7 @@ import { DATA_PATH } from "../const.js";
/**
* Utility class for handling access keys to the app.
* [!] Needs replacement.
* [!] For alpha launch only.
*/
export class AccessKeys {

Wyświetl plik

@ -50,8 +50,8 @@ export class TwitterCapture {
renderTimeout: 4000,
};
/** @type {object} - Based on TwitterCapture.defaults */
options = {};
/** @type {object} */
options = {}; // Based on TwitterCapture.defaults
/** @type {?string} */
url = null;
@ -61,9 +61,9 @@ export class TwitterCapture {
/**
* @type {{
* browser: ?import('playwright').Browser,
* context: ?import('playwright').BrowserContext,
* page: ?import('playwright').Page,
* browser: playwright.Browser,
* context: playwright.BrowserContext,
* page: playwright.Page,
* viewport: ?{width: number, height: number},
* ready: boolean
* }}
@ -219,7 +219,7 @@ export class TwitterCapture {
}
/**
* Adjusts the current page's DOM so the resulting PDF is not affected by UI artifact.
* Adjusts the current page's DOM so the resulting PDF is not affected by UI artifact.
* Playwright needs to be ready.
*
* @returns {Promise<void>}
@ -366,7 +366,7 @@ export class TwitterCapture {
* Uses Playwright's network interception to capture images and add them to `this.interceptedJPEGs`.
* Called whenever Playwright processes an HTTP response.
*
* @param {import('playwright').Response} response
* @param {playwright.Response} response
* @returns {Promise<void>}
*/
interceptJpegs = async(response) => {

66
docs/const.md 100644
Wyświetl plik

@ -0,0 +1,66 @@
<a name="const.module_js"></a>
## js
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT
* [js](#const.module_js)
* [.CERTS_PATH](#const.module_js.CERTS_PATH)
* [.DATA_PATH](#const.module_js.DATA_PATH)
* [.TMP_PATH](#const.module_js.TMP_PATH)
* [.TEMPLATES_PATH](#const.module_js.TEMPLATES_PATH)
* [.EXECUTABLES_FOLDER](#const.module_js.EXECUTABLES_FOLDER)
* [.STATIC_PATH](#const.module_js.STATIC_PATH)
* [.MAX_PARALLEL_CAPTURES_TOTAL](#const.module_js.MAX_PARALLEL_CAPTURES_TOTAL)
* [.MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY](#const.module_js.MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY)
<a name="const.module_js.CERTS_PATH"></a>
### js.CERTS\_PATH
Path to the folder holding the certificates used for signing the PDFs.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.DATA_PATH"></a>
### js.DATA\_PATH
Path to the "data" folder.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.TMP_PATH"></a>
### js.TMP\_PATH
Path to the folder in which temporary files will be written by the app.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.TEMPLATES_PATH"></a>
### js.TEMPLATES\_PATH
Path to the "templates" folder.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.EXECUTABLES_FOLDER"></a>
### js.EXECUTABLES\_FOLDER
Path to the "executables" folder, for dependencies that are meant to be executed directly, such as `yt-dlp`.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.STATIC_PATH"></a>
### js.STATIC\_PATH
Path to the "static" folder.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.MAX_PARALLEL_CAPTURES_TOTAL"></a>
### js.MAX\_PARALLEL\_CAPTURES\_TOTAL
Maximum capture processes that can be run in parallel.
**Kind**: static constant of [<code>js</code>](#const.module_js)
<a name="const.module_js.MAX_PARALLEL_CAPTURES_PER_ACCESS_KEY"></a>
### js.MAX\_PARALLEL\_CAPTURES\_PER\_ACCESS\_KEY
Maximum capture processes that can be run in parallel for a given key.
**Kind**: static constant of [<code>js</code>](#const.module_js)

80
docs/server.md 100644
Wyświetl plik

@ -0,0 +1,80 @@
<a name="server.module_js"></a>
## js
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT
* [js](#server.module_js)
* [~successLog](#server.module_js..successLog) : <code>SuccessLog</code>
* [~accessKeys](#server.module_js..accessKeys) : <code>AccessKeys</code>
* [~CAPTURES_WATCH](#server.module_js..CAPTURES_WATCH) : <code>Object</code>
* [~index(request, reply)](#server.module_js..index) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
* [~capture(request, reply)](#server.module_js..capture) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
* [~checkHash(request, reply)](#server.module_js..checkHash) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
<a name="server.module_js..successLog"></a>
### js~successLog : <code>SuccessLog</code>
**Kind**: inner constant of [<code>js</code>](#server.module_js)
<a name="server.module_js..accessKeys"></a>
### js~accessKeys : <code>AccessKeys</code>
**Kind**: inner constant of [<code>js</code>](#server.module_js)
<a name="server.module_js..CAPTURES_WATCH"></a>
### js~CAPTURES\_WATCH : <code>Object</code>
Keeps track of how many capture processes are currently running.
May be used to redirect users if over capacity.
[!] Only good for early prototyping.
**Kind**: inner constant of [<code>js</code>](#server.module_js)
<a name="server.module_js..index"></a>
### js~index(request, reply) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
[GET] /
Shows the landing page / form.
Assumes `fastify` is in scope.
**Kind**: inner method of [<code>js</code>](#server.module_js)
| Param | Type |
| --- | --- |
| request | <code>fastify.FastifyRequest</code> |
| reply | <code>fastify.FastifyReply</code> |
<a name="server.module_js..capture"></a>
### js~capture(request, reply) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
[POST] `/`
Processes a request to capture a `twitter.com` url.
Serves PDF bytes directly if operation is successful.
Returns to form with specific error code, passed as `errorReason`, otherwise.
Assumes `fastify` is in scope.
**Kind**: inner method of [<code>js</code>](#server.module_js)
| Param | Type |
| --- | --- |
| request | <code>fastify.FastifyRequest</code> |
| reply | <code>fastify.FastifyReply</code> |
<a name="server.module_js..checkHash"></a>
### js~checkHash(request, reply) ⇒ <code>Promise.&lt;fastify.FastifyReply&gt;</code>
[GET] `/api/v1/hashes/check/<sha512-hash>`.
Checks if a given SHA512 hash is in the "success" logs, meaning this app created it.
Hash is passed as the last parameter, url encoded.
Assumes `fastify` is in scope.
Returns HTTP 200 if found, HTTP 404 if not.
**Kind**: inner method of [<code>js</code>](#server.module_js)
| Param | Type |
| --- | --- |
| request | <code>fastify.FastifyRequest</code> |
| reply | <code>fastify.FastifyReply</code> |

Wyświetl plik

@ -0,0 +1,51 @@
<a name="utils.module_AccessKeys"></a>
## AccessKeys
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT
* [AccessKeys](#utils.module_AccessKeys)
* [.AccessKeys](#utils.module_AccessKeys.AccessKeys)
* [new exports.AccessKeys()](#new_utils.module_AccessKeys.AccessKeys_new)
* [.filepath](#utils.module_AccessKeys.AccessKeys+filepath) : <code>string</code>
* [.check(accessKey)](#utils.module_AccessKeys.AccessKeys+check)
<a name="utils.module_AccessKeys.AccessKeys"></a>
### AccessKeys.AccessKeys
Utility class for handling access keys to the app.
[!] For alpha launch only.
**Kind**: static class of [<code>AccessKeys</code>](#utils.module_AccessKeys)
* [.AccessKeys](#utils.module_AccessKeys.AccessKeys)
* [new exports.AccessKeys()](#new_utils.module_AccessKeys.AccessKeys_new)
* [.filepath](#utils.module_AccessKeys.AccessKeys+filepath) : <code>string</code>
* [.check(accessKey)](#utils.module_AccessKeys.AccessKeys+check)
<a name="new_utils.module_AccessKeys.AccessKeys_new"></a>
#### new exports.AccessKeys()
On init:
- Create access keys file if it doesn't exist
- Load keys from file into `this.#keys`.
<a name="utils.module_AccessKeys.AccessKeys+filepath"></a>
#### accessKeys.filepath : <code>string</code>
Complete path to `access-keys.json`.
**Kind**: instance property of [<code>AccessKeys</code>](#utils.module_AccessKeys.AccessKeys)
<a name="utils.module_AccessKeys.AccessKeys+check"></a>
#### accessKeys.check(accessKey)
Checks that a given access key is valid and active.
**Kind**: instance method of [<code>AccessKeys</code>](#utils.module_AccessKeys.AccessKeys)
| Param | Type |
| --- | --- |
| accessKey | <code>string</code> |

Wyświetl plik

@ -0,0 +1,73 @@
<a name="utils.module_logCaptureSuccess"></a>
## logCaptureSuccess
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT
* [logCaptureSuccess](#utils.module_logCaptureSuccess)
* [.SuccessLog](#utils.module_logCaptureSuccess.SuccessLog)
* [new exports.SuccessLog()](#new_utils.module_logCaptureSuccess.SuccessLog_new)
* [.filepath](#utils.module_logCaptureSuccess.SuccessLog+filepath) : <code>string</code>
* [.add(accessKey, pdfBytes)](#utils.module_logCaptureSuccess.SuccessLog+add)
* [.findHashInLogs(hash)](#utils.module_logCaptureSuccess.SuccessLog+findHashInLogs) ⇒ <code>boolean</code>
* [.reset()](#utils.module_logCaptureSuccess.SuccessLog+reset) ⇒ <code>void</code>
<a name="utils.module_logCaptureSuccess.SuccessLog"></a>
### logCaptureSuccess.SuccessLog
**Kind**: static class of [<code>logCaptureSuccess</code>](#utils.module_logCaptureSuccess)
* [.SuccessLog](#utils.module_logCaptureSuccess.SuccessLog)
* [new exports.SuccessLog()](#new_utils.module_logCaptureSuccess.SuccessLog_new)
* [.filepath](#utils.module_logCaptureSuccess.SuccessLog+filepath) : <code>string</code>
* [.add(accessKey, pdfBytes)](#utils.module_logCaptureSuccess.SuccessLog+add)
* [.findHashInLogs(hash)](#utils.module_logCaptureSuccess.SuccessLog+findHashInLogs) ⇒ <code>boolean</code>
* [.reset()](#utils.module_logCaptureSuccess.SuccessLog+reset) ⇒ <code>void</code>
<a name="new_utils.module_logCaptureSuccess.SuccessLog_new"></a>
#### new exports.SuccessLog()
On init:
- Create log file if it doesn't exist
- Load hashes from file into `this.#hashes`.
<a name="utils.module_logCaptureSuccess.SuccessLog+filepath"></a>
#### successLog.filepath : <code>string</code>
Complete path to `success-log.json`.
**Kind**: instance property of [<code>SuccessLog</code>](#utils.module_logCaptureSuccess.SuccessLog)
<a name="utils.module_logCaptureSuccess.SuccessLog+add"></a>
#### successLog.add(accessKey, pdfBytes)
Calculates the hash of a PDF and:
- Creates a success log entry
- Updates `this.#hashes` (so it doesn't need to reload from file)
**Kind**: instance method of [<code>SuccessLog</code>](#utils.module_logCaptureSuccess.SuccessLog)
| Param | Type | Description |
| --- | --- | --- |
| accessKey | <code>string</code> | |
| pdfBytes | <code>Buffer</code> | Used to store a SHA512 hash of the PDF that was delivered |
<a name="utils.module_logCaptureSuccess.SuccessLog+findHashInLogs"></a>
#### successLog.findHashInLogs(hash) ⇒ <code>boolean</code>
Checks whether or not a given hash is present in the logs.
**Kind**: instance method of [<code>SuccessLog</code>](#utils.module_logCaptureSuccess.SuccessLog)
| Param | Type |
| --- | --- |
| hash | <code>string</code> |
<a name="utils.module_logCaptureSuccess.SuccessLog+reset"></a>
#### successLog.reset() ⇒ <code>void</code>
Resets `success-log.json`.
Also clears `this.#hashes`.
**Kind**: instance method of [<code>SuccessLog</code>](#utils.module_logCaptureSuccess.SuccessLog)

Wyświetl plik

@ -0,0 +1,263 @@
<a name="utils.module_TwitterCapture"></a>
## TwitterCapture
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT
* [TwitterCapture](#utils.module_TwitterCapture)
* [.TwitterCapture](#utils.module_TwitterCapture.TwitterCapture)
* [new exports.TwitterCapture(url, options)](#new_utils.module_TwitterCapture.TwitterCapture_new)
* [.defaults](#utils.module_TwitterCapture.TwitterCapture+defaults)
* [.options](#utils.module_TwitterCapture.TwitterCapture+options) : <code>object</code>
* [.url](#utils.module_TwitterCapture.TwitterCapture+url) : <code>string</code>
* [.urlType](#utils.module_TwitterCapture.TwitterCapture+urlType) : <code>string</code>
* [.playwright](#utils.module_TwitterCapture.TwitterCapture+playwright) : <code>Object</code>
* [.interceptedJPEGs](#utils.module_TwitterCapture.TwitterCapture+interceptedJPEGs) : <code>object.&lt;string, Buffer&gt;</code>
* [.capture](#utils.module_TwitterCapture.TwitterCapture+capture) ⇒ <code>Promise.&lt;Buffer&gt;</code>
* [.setup](#utils.module_TwitterCapture.TwitterCapture+setup) ⇒ <code>Promise.&lt;void&gt;</code>
* [.teardown](#utils.module_TwitterCapture.TwitterCapture+teardown)
* [.adjustUIForCapture](#utils.module_TwitterCapture.TwitterCapture+adjustUIForCapture) ⇒ <code>Promise.&lt;void&gt;</code>
* [.runBrowserBehaviors](#utils.module_TwitterCapture.TwitterCapture+runBrowserBehaviors) ⇒ <code>Promise.&lt;void&gt;</code>
* [.resizeViewportToFitDocument](#utils.module_TwitterCapture.TwitterCapture+resizeViewportToFitDocument) ⇒ <code>Promise.&lt;void&gt;</code>
* [.getDocumentDimensions](#utils.module_TwitterCapture.TwitterCapture+getDocumentDimensions) ⇒ <code>Promise.&lt;{width: number, height: number}&gt;</code>
* [.interceptJpegs](#utils.module_TwitterCapture.TwitterCapture+interceptJpegs) ⇒ <code>Promise.&lt;void&gt;</code>
* [.generateRawPDF](#utils.module_TwitterCapture.TwitterCapture+generateRawPDF) ⇒ <code>Promise.&lt;Buffer&gt;</code>
* [.addInterceptedJPEGsToPDF](#utils.module_TwitterCapture.TwitterCapture+addInterceptedJPEGsToPDF) ⇒ <code>Promise.&lt;void&gt;</code>
* [.captureAndAddVideoToPDF](#utils.module_TwitterCapture.TwitterCapture+captureAndAddVideoToPDF) ⇒ <code>Promise.&lt;void&gt;</code>
* [.cropMarginsOnPDF](#utils.module_TwitterCapture.TwitterCapture+cropMarginsOnPDF)
* [.signPDF](#utils.module_TwitterCapture.TwitterCapture+signPDF) ⇒ <code>Buffer</code>
* [.filterOptions](#utils.module_TwitterCapture.TwitterCapture+filterOptions)
* [.filterUrl](#utils.module_TwitterCapture.TwitterCapture+filterUrl) ⇒ <code>bool</code>
<a name="utils.module_TwitterCapture.TwitterCapture"></a>
### TwitterCapture.TwitterCapture
Generates a "sealed" PDF out of a twitter.com url using Playwright.
Usage:
```
const tweet = new TwitterCapture(url);
const pdf = await tweet.capture();
fs.writeFileSync("tweet.pdf", pdf);
```
**Kind**: static class of [<code>TwitterCapture</code>](#utils.module_TwitterCapture)
* [.TwitterCapture](#utils.module_TwitterCapture.TwitterCapture)
* [new exports.TwitterCapture(url, options)](#new_utils.module_TwitterCapture.TwitterCapture_new)
* [.defaults](#utils.module_TwitterCapture.TwitterCapture+defaults)
* [.options](#utils.module_TwitterCapture.TwitterCapture+options) : <code>object</code>
* [.url](#utils.module_TwitterCapture.TwitterCapture+url) : <code>string</code>
* [.urlType](#utils.module_TwitterCapture.TwitterCapture+urlType) : <code>string</code>
* [.playwright](#utils.module_TwitterCapture.TwitterCapture+playwright) : <code>Object</code>
* [.interceptedJPEGs](#utils.module_TwitterCapture.TwitterCapture+interceptedJPEGs) : <code>object.&lt;string, Buffer&gt;</code>
* [.capture](#utils.module_TwitterCapture.TwitterCapture+capture) ⇒ <code>Promise.&lt;Buffer&gt;</code>
* [.setup](#utils.module_TwitterCapture.TwitterCapture+setup) ⇒ <code>Promise.&lt;void&gt;</code>
* [.teardown](#utils.module_TwitterCapture.TwitterCapture+teardown)
* [.adjustUIForCapture](#utils.module_TwitterCapture.TwitterCapture+adjustUIForCapture) ⇒ <code>Promise.&lt;void&gt;</code>
* [.runBrowserBehaviors](#utils.module_TwitterCapture.TwitterCapture+runBrowserBehaviors) ⇒ <code>Promise.&lt;void&gt;</code>
* [.resizeViewportToFitDocument](#utils.module_TwitterCapture.TwitterCapture+resizeViewportToFitDocument) ⇒ <code>Promise.&lt;void&gt;</code>
* [.getDocumentDimensions](#utils.module_TwitterCapture.TwitterCapture+getDocumentDimensions) ⇒ <code>Promise.&lt;{width: number, height: number}&gt;</code>
* [.interceptJpegs](#utils.module_TwitterCapture.TwitterCapture+interceptJpegs) ⇒ <code>Promise.&lt;void&gt;</code>
* [.generateRawPDF](#utils.module_TwitterCapture.TwitterCapture+generateRawPDF) ⇒ <code>Promise.&lt;Buffer&gt;</code>
* [.addInterceptedJPEGsToPDF](#utils.module_TwitterCapture.TwitterCapture+addInterceptedJPEGsToPDF) ⇒ <code>Promise.&lt;void&gt;</code>
* [.captureAndAddVideoToPDF](#utils.module_TwitterCapture.TwitterCapture+captureAndAddVideoToPDF) ⇒ <code>Promise.&lt;void&gt;</code>
* [.cropMarginsOnPDF](#utils.module_TwitterCapture.TwitterCapture+cropMarginsOnPDF)
* [.signPDF](#utils.module_TwitterCapture.TwitterCapture+signPDF) ⇒ <code>Buffer</code>
* [.filterOptions](#utils.module_TwitterCapture.TwitterCapture+filterOptions)
* [.filterUrl](#utils.module_TwitterCapture.TwitterCapture+filterUrl) ⇒ <code>bool</code>
<a name="new_utils.module_TwitterCapture.TwitterCapture_new"></a>
#### new exports.TwitterCapture(url, options)
| Param | Type | Description |
| --- | --- | --- |
| url | <code>string</code> | `twitter.com` url to capture. Works best on statuses and threads. |
| options | <code>object</code> | See `TwitterCapture.defaults` for detailed options. Will use defaults unless overridden. |
<a name="utils.module_TwitterCapture.TwitterCapture+defaults"></a>
#### twitterCapture.defaults
Defaults for options that can be passed to `TwitterCapture`.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
**Properties**
| Name | Type | Description |
| --- | --- | --- |
| privateKeyPath | <code>string</code> | Path to `.pem` file containing a private key. |
| certPath | <code>string</code> | Path to a `.pem` file containing a certificate. |
| tmpFolderPath | <code>string</code> | Path to a folder in which temporary files can be written. |
| ytDlpPath | <code>string</code> | Path to the `yt-dlp` executable. |
| timestampServerUrl | <code>string</code> | Timestamping server. |
| networkidleTimeout | <code>number</code> | Time to wait for "networkidle" state. |
| runBrowserBehaviors | <code>boolean</code> | If `true`, will try to auto-scroll and open more responses. Set to `false` automatically when trying to capture a profile url. |
| browserBehaviorsTimeout | <code>number</code> | Maximum browser behaviors execution time. |
| videoCaptureTimeout | <code>number</code> | Maximum yt-dlp execution time. |
| renderTimeout | <code>number</code> | Time to wait for re-renders. |
<a name="utils.module_TwitterCapture.TwitterCapture+options"></a>
#### twitterCapture.options : <code>object</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+url"></a>
#### twitterCapture.url : <code>string</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+urlType"></a>
#### twitterCapture.urlType : <code>string</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+playwright"></a>
#### twitterCapture.playwright : <code>Object</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+interceptedJPEGs"></a>
#### twitterCapture.interceptedJPEGs : <code>object.&lt;string, Buffer&gt;</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+capture"></a>
#### twitterCapture.capture ⇒ <code>Promise.&lt;Buffer&gt;</code>
Captures the current Twitter.com url and makes it a signed PDF.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
**Returns**: <code>Promise.&lt;Buffer&gt;</code> - Signed PDF.
<a name="utils.module_TwitterCapture.TwitterCapture+setup"></a>
#### twitterCapture.setup ⇒ <code>Promise.&lt;void&gt;</code>
Sets up the browser used for capture as well as network interception for images capture.
Populates `this.playwright`.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+teardown"></a>
#### twitterCapture.teardown
Closes browser used for capture.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+adjustUIForCapture"></a>
#### twitterCapture.adjustUIForCapture ⇒ <code>Promise.&lt;void&gt;</code>
Adjusts the current page's DOM so the resulting PDF is not affected by UI artifacts.
Playwright needs to be ready.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+runBrowserBehaviors"></a>
#### twitterCapture.runBrowserBehaviors ⇒ <code>Promise.&lt;void&gt;</code>
Runs browser behaviors:
- Tries to scroll through the page.
- Tries to click on the next available "Show replies" button.
Playwright needs to be ready.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+resizeViewportToFitDocument"></a>
#### twitterCapture.resizeViewportToFitDocument ⇒ <code>Promise.&lt;void&gt;</code>
Stretches the viewport to match the document's dimensions.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+getDocumentDimensions"></a>
#### twitterCapture.getDocumentDimensions ⇒ <code>Promise.&lt;{width: number, height: number}&gt;</code>
Returns the current dimensions of the document.
Playwright needs to be ready.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
<a name="utils.module_TwitterCapture.TwitterCapture+interceptJpegs"></a>
#### twitterCapture.interceptJpegs ⇒ <code>Promise.&lt;void&gt;</code>
Uses Playwright's network interception to capture images and add them to `this.interceptedJPEGs`.
Called whenever Playwright processes an HTTP response.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Param | Type |
| --- | --- |
| response | <code>playwright.Response</code> |
<a name="utils.module_TwitterCapture.TwitterCapture+generateRawPDF"></a>
#### twitterCapture.generateRawPDF ⇒ <code>Promise.&lt;Buffer&gt;</code>
Generates a PDF of the current page using Chrome Dev Tools.
Playwright needs to be ready.
Populates `this.pdf`.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
**Returns**: <code>Promise.&lt;Buffer&gt;</code> - PDF Bytes
<a name="utils.module_TwitterCapture.TwitterCapture+addInterceptedJPEGsToPDF"></a>
#### twitterCapture.addInterceptedJPEGsToPDF ⇒ <code>Promise.&lt;void&gt;</code>
Adds entries from `this.interceptedJPEGs` to the PDF.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Type | Description |
| --- | --- |
| <code>PDFDocument</code> | Editable PDF object from `pdf-lib`. |
<a name="utils.module_TwitterCapture.TwitterCapture+captureAndAddVideoToPDF"></a>
#### twitterCapture.captureAndAddVideoToPDF ⇒ <code>Promise.&lt;void&gt;</code>
Tries to capture main video from current Twitter url and add it as attachment to the PDF.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Type | Description |
| --- | --- |
| <code>PDFDocument</code> | Editable PDF object from `pdf-lib`. |
<a name="utils.module_TwitterCapture.TwitterCapture+cropMarginsOnPDF"></a>
#### twitterCapture.cropMarginsOnPDF
Tries to remove some of the white space at the bottom of the PDF.
[!] TODO: This is a "let's ship it" hack. We will need to find a better solution.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Param | Type |
| --- | --- |
| editablePDF | <code>PDFDocument</code> |
<a name="utils.module_TwitterCapture.TwitterCapture+signPDF"></a>
#### twitterCapture.signPDF ⇒ <code>Buffer</code>
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
**Returns**: <code>Buffer</code> - PDF Bytes
| Param | Type | Description |
| --- | --- | --- |
| editedPDF | <code>Buffer</code> | PDF Bytes |
<a name="utils.module_TwitterCapture.TwitterCapture+filterOptions"></a>
#### twitterCapture.filterOptions
Applies some basic filtering to new option objects and fills gaps with defaults.
Replaces `this.options` after filtering.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Param | Type |
| --- | --- |
| newOptions | <code>Promise.&lt;object&gt;</code> |
<a name="utils.module_TwitterCapture.TwitterCapture+filterUrl"></a>
#### twitterCapture.filterUrl ⇒ <code>bool</code>
Filters a given URL to ensure it's a `twitter.com` one.
Also asserts its "type": "status", "search" or "profile".
Automatically populates `this.url` and `this.urlType`.
**Kind**: instance property of [<code>TwitterCapture</code>](#utils.module_TwitterCapture.TwitterCapture)
| Param | Type |
| --- | --- |
| url | <code>string</code> |

Wyświetl plik

@ -0,0 +1,7 @@
<a name="module_utils"></a>
## utils
archive.social
**Author**: The Harvard Library Innovation Lab
**License**: MIT

1567
package-lock.json wygenerowano

Plik diff jest za duży Load Diff

Wyświetl plik

@ -8,7 +8,8 @@
"start": "fastify start app/server.js -l warn",
"dev": "fastify start app/server.js -l info -w",
"postinstall": "cd scripts && bash download-yt-dlp.sh && bash pip-install.sh",
"generate-local-cert": "cd scripts && bash generate-local-cert.sh",
"generate-dev-cert": "cd scripts && bash generate-dev-cert.sh",
"docgen": "cd scripts && bash docgen.sh",
"test": "echo \"Error: no test specified\" && exit 1"
},
"dependencies": {
@ -40,6 +41,7 @@
"forwarded": "^0.2.0",
"ieee754": "^1.2.1",
"ipaddr.js": "^1.9.1",
"jsdoc-to-markdown": "^7.1.1",
"json-schema-traverse": "^1.0.0",
"light-my-request": "^5.6.1",
"lru-cache": "^6.0.0",

Wyświetl plik

@ -0,0 +1,8 @@
# Generates documentation using JSDoc comments
jsdoc2md ../app/server.js > ../docs/server.md;
jsdoc2md ../app/const.js > ../docs/const.md;
jsdoc2md ../app/utils/index.js > ../docs/utils/index.md;
jsdoc2md ../app/utils/AccessKeys.js > ../docs/utils/AccessKeys.md;
jsdoc2md ../app/utils/SuccessLog.js > ../docs/utils/SuccessLog.md;
jsdoc2md ../app/utils/TwitterCapture.js > ../docs/utils/TwitterCapture.md;

Wyświetl plik

@ -0,0 +1,3 @@
# [DEV ONLY] Generates a local key pair that can be used for signing PDFs.
# Will be saved under ../certs.
openssl req -x509 -newkey rsa:4096 -keyout ../certs/key.pem -out ../certs/cert.pem -days 3650 -nodes -subj /CN="archive.social DEV";

Wyświetl plik

@ -1,3 +0,0 @@
# Generates a local key pair that can be used for signing PDFs.
# Will be saved under ../certs.
openssl req -x509 -newkey rsa:4096 -keyout ../certs/key.pem -out ../certs/cert.pem -days 3650 -nodes -subj /CN="archive.social";

Wyświetl plik

@ -1,2 +1 @@
pip3 install "pyHanko[pkcs11,image-support,opentype,xmp]"==0.15.1;
pip3 install pdfCropMargins==1.0.9;