Attachments issue: workaround (see notes)

I finally found it!

The problem comes from the fact that `ghostscript` behaves differently on Mac OS and Linux: on the latter, it strips existing attachments.

Unfortunately, I was able to confirm this is not just version-related, and it might be hard to fix.

`pdf-crop-margins` is also affected by this issue, since it uses `ghostscript` behind the scenes (at least for what we want it to do).

I therefore put together a quickfix / workaround, resizing the PDF manually to try and get most of the trailing whitespace out, and skipping compression.

The cropping is much less consistent, but:
- It's much _much_ faster
- I _think_ this should fix attachments on prod
- We can revisit when time allows

I suspect I will need to make further adjustments to the cropping logic once this has been tested on the production server.

This commit also adds an "Unfold thread" checkbox on the homepage form, allowing users to skip browser behaviors (which sometimes gives better results).
pull/4/head
Matteo Cargnelutti 2022-11-19 02:27:11 -05:00
rodzic 24759d210d
commit bb3c37bed4
4 zmienionych plików z 60 dodań i 65 usunięć

Wyświetl plik

@ -162,7 +162,7 @@ export default async function (fastify, opts) {
CAPTURES_WATCH.currentByAccessKey[accessKey] = 1; CAPTURES_WATCH.currentByAccessKey[accessKey] = 1;
} }
const tweets = new TwitterCapture(data.url); const tweets = new TwitterCapture(data.url, {runBrowserBehaviors: "auto-scroll" in data});
const pdf = await tweets.capture(); const pdf = await tweets.capture();
SuccessLog.add(accessKey, pdf); SuccessLog.add(accessKey, pdf);

Wyświetl plik

@ -173,12 +173,28 @@ body#index > main form fieldset.submit {
text-align: right; text-align: right;
} }
body#index > main form fieldset.submit a { body#index > main form fieldset.submit span {
font-size: 0.85rem; }
body#index > main form fieldset.submit span * {
display: inline-block; display: inline-block;
font-size: 0.85rem;
width: auto;
padding: 0.5rem; padding: 0.5rem;
} }
body#index > main form fieldset.submit span input {
padding-right: 0.25rem;
margin-top: -1px;
vertical-align: middle;
cursor: pointer;
}
body#index > main form fieldset.submit span label {
cursor: pointer;
padding-left: 0.25rem;
}
/* INDEX - DIALOG */ /* INDEX - DIALOG */
body#index > main dialog { body#index > main dialog {
display: none; display: none;

Wyświetl plik

@ -46,7 +46,11 @@
</fieldset> </fieldset>
<fieldset class="submit"> <fieldset class="submit">
<a href="https://docs.google.com/forms/d/11pVfBReAHmHGmtzKrQ4XqrvOMBr4BI4bX-hDdsn0OuQ/viewform" target="_blank" rel="noreferer">Request an access key.</a> <span>
<input type="checkbox" name="auto-scroll" id="auto-scroll" checked/>
<label for="auto-scroll">Unfold thread</label>
</span>
<button>Capture</button> <button>Capture</button>
</fieldset> </fieldset>
@ -128,7 +132,7 @@
<p>You can download <a href="https://crt.sh/?id=8004113167">this public key file</a> and add it to Adobe Acrobat. We'll also add a page on our site shortly where you can upload a PDF and we'll confirm whether it really came from us.</p> <p>You can download <a href="https://crt.sh/?id=8004113167">this public key file</a> and add it to Adobe Acrobat. We'll also add a page on our site shortly where you can upload a PDF and we'll confirm whether it really came from us.</p>
<p>Tech nerd note: As an extra check, the key you're downloading here happens to be one that we also verified via LetsEncrypt as belonging to our Harvard domain, lil.law.harvard.edu. You can see the same key in <a href="https://crt.sh/?id=8004113167">the certificate transparency logs</a>.</p> <p>Tech nerd note: As an extra check, the key you're downloading here happens to be one that we also verified via LetsEncrypt as belonging to our domain. You can see the same key in <a href="https://crt.sh/?id=8004113167">the certificate transparency logs</a>.</p>
<h2>Does a signature on a PDF web archive mean it's real?</h2> <h2>Does a signature on a PDF web archive mean it's real?</h2>

Wyświetl plik

@ -32,6 +32,7 @@ export class TwitterCapture {
* @property {string} ytDlpPath - Path to the `yt-dlp` executable. * @property {string} ytDlpPath - Path to the `yt-dlp` executable.
* @property {string} timestampServerUrl - Timestamping server. * @property {string} timestampServerUrl - Timestamping server.
* @property {number} networkidleTimeout - Time to wait for "networkidle" state. * @property {number} networkidleTimeout - Time to wait for "networkidle" state.
* @property {boolean} runBrowserBehaviors - If `true`, will try to auto-scroll and open more responses. Set to `false` automatically when trying to capture a profile url.
* @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time. * @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time.
* @property {number} videoCaptureTimeout - Maximum yt-dlp execution time. * @property {number} videoCaptureTimeout - Maximum yt-dlp execution time.
* @property {number} renderTimeout - Time to wait for re-renders. * @property {number} renderTimeout - Time to wait for re-renders.
@ -43,9 +44,10 @@ export class TwitterCapture {
ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`, ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
timestampServerUrl: "http://timestamp.digicert.com", timestampServerUrl: "http://timestamp.digicert.com",
networkidleTimeout: 5000, networkidleTimeout: 5000,
runBrowserBehaviors: true,
browserBehaviorsTimeout: 33500, browserBehaviorsTimeout: 33500,
videoCaptureTimeout: 10000, videoCaptureTimeout: 10000,
renderTimeout: 4000 renderTimeout: 4000,
}; };
/** @type {object} - Based on TwitterCapture.defaults */ /** @type {object} - Based on TwitterCapture.defaults */
@ -62,6 +64,7 @@ export class TwitterCapture {
* browser: ?import('playwright').Browser, * browser: ?import('playwright').Browser,
* context: ?import('playwright').BrowserContext, * context: ?import('playwright').BrowserContext,
* page: ?import('playwright').Page, * page: ?import('playwright').Page,
* viewport: ?{width: number, height: number},
* ready: boolean * ready: boolean
* }} * }}
*/ */
@ -69,6 +72,7 @@ export class TwitterCapture {
browser: null, browser: null,
context: null, context: null,
page: null, page: null,
viewport: null,
ready: false ready: false
}; };
@ -82,6 +86,11 @@ export class TwitterCapture {
constructor(url, options = {}) { constructor(url, options = {}) {
this.filterUrl(url); this.filterUrl(url);
this.filterOptions(options); this.filterOptions(options);
// Options adjustments:
if (this.urlType === "profile") {
this.options.runBrowserBehaviors = false;
}
} }
/** /**
@ -109,11 +118,13 @@ export class TwitterCapture {
await this.adjustUIForCapture(); await this.adjustUIForCapture();
// Run browser behaviors // Run browser behaviors
if (this.urlType !== "profile") { // Skipped on profile pages if (this.options.runBrowserBehaviors === true) {
await this.runBrowserBehaviors(); await this.runBrowserBehaviors();
} }
else { else {
new Promise(resolve => setTimeout(resolve, this.options.networkidleTimeout)); await new Promise((resolve) =>
setTimeout(resolve, this.options.networkidleTimeout + this.options.renderTimeout)
);
} }
// Wait for network idle // Wait for network idle
@ -149,15 +160,16 @@ export class TwitterCapture {
//console.log(err); //console.log(err);
} }
// Try to crop remaining white space
await this.cropMarginsOnPDF(editablePDF);
// Try to capture video, if any, and add it as attachment // Try to capture video, if any, and add it as attachment
await this.captureAndAddVideoToPDF(editablePDF); await this.captureAndAddVideoToPDF(editablePDF);
// Freeze edited PDF in memory // Freeze edited PDF in memory
editedPDF = await editablePDF.save(); editedPDF = await editablePDF.save();
// Crop and compress fs.writeFileSync("unsigned.pdf", editedPDF)
editedPDF = await this.cropMarginsOnPDF(editedPDF);
editedPDF = await this.compressPDF(editedPDF);
// Sign // Sign
editedPDF = await this.signPDF(editedPDF); editedPDF = await this.signPDF(editedPDF);
@ -174,7 +186,7 @@ export class TwitterCapture {
/** /**
* Sets up the browser used for capture as well as network interception for images capture. * Sets up the browser used for capture as well as network interception for images capture.
* Populates `this.playwright` and `this.playwrightIsReady`; * Populates `this.playwright`.
* @returns {Promise<void>} * @returns {Promise<void>}
*/ */
setup = async() => { setup = async() => {
@ -188,6 +200,8 @@ export class TwitterCapture {
this.playwright.context = await this.playwright.browser.newContext({ userAgent }); this.playwright.context = await this.playwright.browser.newContext({ userAgent });
this.playwright.page = await this.playwright.context.newPage(); this.playwright.page = await this.playwright.context.newPage();
this.playwright.viewport = viewport;
this.playwright.page.setViewportSize(viewport); this.playwright.page.setViewportSize(viewport);
await new Promise(resolve => setTimeout(resolve, 500)); // [Debug] await new Promise(resolve => setTimeout(resolve, 500)); // [Debug]
@ -495,65 +509,26 @@ export class TwitterCapture {
} }
/** /**
* @param {Buffer} editedPDF - PDF Bytes * Tries to remove some of the white space at the bottom of the PDF.
* @returns {Buffer} - PDF Bytes * [!] TODO: This is a "let's ship it" hack. We will need to find a better solution.
* @param {PDFDocument} editablePDF
*/ */
cropMarginsOnPDF = async(editedPDF) => { cropMarginsOnPDF = async(editablePDF) => {
// Save PDF to disk const page = editablePDF.getPage(0);
const id = uuidv4(); const originalHeight = page.getHeight();
const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;
fs.writeFileSync(filepathIn, editedPDF);
// Apply cropping // Only crop if content > viewport
const run = spawnSync( if (this.playwright.viewport.height > originalHeight) {
"pdf-crop-margins", return;
["-p", "0", "-a", "-20", "-o", filepathOut, filepathIn],
{ encoding: "utf-8" }
);
if (run.status !== 0) {
throw new Error(run.stderr);
} }
// Load cropped file from disk and return
editedPDF = fs.readFileSync(filepathOut);
fs.unlink(filepathIn, () => {});
fs.unlink(filepathOut, () => {});
return editedPDF; const reductionFactor = this.options.runBrowserBehaviors ? 44 : 88;
}
/** const newHeight = Math.floor(originalHeight - (originalHeight / 100 * reductionFactor));
* @param {Buffer} editedPDF - PDF Bytes const yShift = originalHeight - newHeight;
* @returns {Buffer} - PDF Bytes
*/
compressPDF = async(editedPDF) => {
// Save PDF to disk
const id = uuidv4();
const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;
fs.writeFileSync(filepathIn, editedPDF);
const run = spawnSync("gs", [ page.setSize(page.getWidth(), newHeight);
"-sDEVICE=pdfwrite", page.translateContent(0, -yShift);
"-dNOPAUSE",
"-dBATCH",
"-dJPEGQ=90",
"-r150",
`-sOutputFile=${filepathOut}`,
`${filepathIn}`,
]);
if (run.status !== 0) {
throw new Error(run.stderr);
}
// Load compressed file from disk and return
editedPDF = fs.readFileSync(filepathOut);
fs.unlink(filepathIn, () => {});
fs.unlink(filepathOut, () => {});
return editedPDF;
} }
/** /**