kopia lustrzana https://github.com/harvard-lil/archive.social
Attachments issue: workaround (see notes)
I finally found it! The problem comes from the fact that `ghostscript` behaves differently on macOS and Linux: on Linux, it strips existing attachments. Unfortunately, I was able to confirm that this is not just version-related, and it might be hard to fix. `pdf-crop-margins` is also affected by this issue, since it uses `ghostscript` behind the scenes (at least for what we want it to do).

I therefore put together a quick fix / workaround: resizing the PDF manually to try to get most of the trailing whitespace out, and skipping compression. The cropping is much less consistent, but:
- It's much _much_ faster
- I _think_ this should fix attachments on prod
- We can revisit when time allows

I suspect I will need to make further adjustments to the cropping logic once this has been tested on the production server.

This commit also adds an "Unfold thread" checkbox on the homepage form, which allows skipping browser behaviors (which sometimes gives better results).

pull/4/head
rodzic
24759d210d
commit
bb3c37bed4
|
@ -162,7 +162,7 @@ export default async function (fastify, opts) {
|
||||||
CAPTURES_WATCH.currentByAccessKey[accessKey] = 1;
|
CAPTURES_WATCH.currentByAccessKey[accessKey] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const tweets = new TwitterCapture(data.url);
|
const tweets = new TwitterCapture(data.url, {runBrowserBehaviors: "auto-scroll" in data});
|
||||||
const pdf = await tweets.capture();
|
const pdf = await tweets.capture();
|
||||||
|
|
||||||
SuccessLog.add(accessKey, pdf);
|
SuccessLog.add(accessKey, pdf);
|
||||||
|
|
|
@ -173,12 +173,28 @@ body#index > main form fieldset.submit {
|
||||||
text-align: right;
|
text-align: right;
|
||||||
}
|
}
|
||||||
|
|
||||||
body#index > main form fieldset.submit a {
|
body#index > main form fieldset.submit span {
|
||||||
font-size: 0.85rem;
|
}
|
||||||
|
|
||||||
|
body#index > main form fieldset.submit span * {
|
||||||
display: inline-block;
|
display: inline-block;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
width: auto;
|
||||||
padding: 0.5rem;
|
padding: 0.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
body#index > main form fieldset.submit span input {
|
||||||
|
padding-right: 0.25rem;
|
||||||
|
margin-top: -1px;
|
||||||
|
vertical-align: middle;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
body#index > main form fieldset.submit span label {
|
||||||
|
cursor: pointer;
|
||||||
|
padding-left: 0.25rem;
|
||||||
|
}
|
||||||
|
|
||||||
/* INDEX - DIALOG */
|
/* INDEX - DIALOG */
|
||||||
body#index > main dialog {
|
body#index > main dialog {
|
||||||
display: none;
|
display: none;
|
||||||
|
|
|
@ -46,7 +46,11 @@
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
<fieldset class="submit">
|
<fieldset class="submit">
|
||||||
<a href="https://docs.google.com/forms/d/11pVfBReAHmHGmtzKrQ4XqrvOMBr4BI4bX-hDdsn0OuQ/viewform" target="_blank" rel="noreferer">Request an access key.</a>
|
<span>
|
||||||
|
<input type="checkbox" name="auto-scroll" id="auto-scroll" checked/>
|
||||||
|
<label for="auto-scroll">Unfold thread</label>
|
||||||
|
</span>
|
||||||
|
|
||||||
<button>Capture</button>
|
<button>Capture</button>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
@ -128,7 +132,7 @@
|
||||||
|
|
||||||
<p>You can download <a href="https://crt.sh/?id=8004113167">this public key file</a> and add it to Adobe Acrobat. We'll also add a page on our site shortly where you can upload a PDF and we'll confirm whether it really came from us.</p>
|
<p>You can download <a href="https://crt.sh/?id=8004113167">this public key file</a> and add it to Adobe Acrobat. We'll also add a page on our site shortly where you can upload a PDF and we'll confirm whether it really came from us.</p>
|
||||||
|
|
||||||
<p>Tech nerd note: As an extra check, the key you're downloading here happens to be one that we also verified via LetsEncrypt as belonging to our Harvard domain, lil.law.harvard.edu. You can see the same key in <a href="https://crt.sh/?id=8004113167">the certificate transparency logs</a>.</p>
|
<p>Tech nerd note: As an extra check, the key you're downloading here happens to be one that we also verified via LetsEncrypt as belonging to our domain. You can see the same key in <a href="https://crt.sh/?id=8004113167">the certificate transparency logs</a>.</p>
|
||||||
|
|
||||||
<h2>Does a signature on a PDF web archive mean it's real?</h2>
|
<h2>Does a signature on a PDF web archive mean it's real?</h2>
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@ export class TwitterCapture {
|
||||||
* @property {string} ytDlpPath - Path to the `yt-dlp` executable.
|
* @property {string} ytDlpPath - Path to the `yt-dlp` executable.
|
||||||
* @property {string} timestampServerUrl - Timestamping server.
|
* @property {string} timestampServerUrl - Timestamping server.
|
||||||
* @property {number} networkidleTimeout - Time to wait for "networkidle" state.
|
* @property {number} networkidleTimeout - Time to wait for "networkidle" state.
|
||||||
|
* @property {boolean} runBrowserBehaviors - If `true`, will try to auto-scroll and open more responses. Set to `false` automatically when trying to capture a profile url.
|
||||||
* @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time.
|
* @property {number} browserBehaviorsTimeout - Maximum browser behaviors execution time.
|
||||||
* @property {number} videoCaptureTimeout - Maximum yt-dlp execution time.
|
* @property {number} videoCaptureTimeout - Maximum yt-dlp execution time.
|
||||||
* @property {number} renderTimeout - Time to wait for re-renders.
|
* @property {number} renderTimeout - Time to wait for re-renders.
|
||||||
|
@ -43,9 +44,10 @@ export class TwitterCapture {
|
||||||
ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
|
ytDlpPath: `${EXECUTABLES_FOLDER}yt-dlp`,
|
||||||
timestampServerUrl: "http://timestamp.digicert.com",
|
timestampServerUrl: "http://timestamp.digicert.com",
|
||||||
networkidleTimeout: 5000,
|
networkidleTimeout: 5000,
|
||||||
|
runBrowserBehaviors: true,
|
||||||
browserBehaviorsTimeout: 33500,
|
browserBehaviorsTimeout: 33500,
|
||||||
videoCaptureTimeout: 10000,
|
videoCaptureTimeout: 10000,
|
||||||
renderTimeout: 4000
|
renderTimeout: 4000,
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @type {object} - Based on TwitterCapture.defaults */
|
/** @type {object} - Based on TwitterCapture.defaults */
|
||||||
|
@ -62,6 +64,7 @@ export class TwitterCapture {
|
||||||
* browser: ?import('playwright').Browser,
|
* browser: ?import('playwright').Browser,
|
||||||
* context: ?import('playwright').BrowserContext,
|
* context: ?import('playwright').BrowserContext,
|
||||||
* page: ?import('playwright').Page,
|
* page: ?import('playwright').Page,
|
||||||
|
* viewport: ?{width: number, height: number},
|
||||||
* ready: boolean
|
* ready: boolean
|
||||||
* }}
|
* }}
|
||||||
*/
|
*/
|
||||||
|
@ -69,6 +72,7 @@ export class TwitterCapture {
|
||||||
browser: null,
|
browser: null,
|
||||||
context: null,
|
context: null,
|
||||||
page: null,
|
page: null,
|
||||||
|
viewport: null,
|
||||||
ready: false
|
ready: false
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -82,6 +86,11 @@ export class TwitterCapture {
|
||||||
constructor(url, options = {}) {
|
constructor(url, options = {}) {
|
||||||
this.filterUrl(url);
|
this.filterUrl(url);
|
||||||
this.filterOptions(options);
|
this.filterOptions(options);
|
||||||
|
|
||||||
|
// Options adjustments:
|
||||||
|
if (this.urlType === "profile") {
|
||||||
|
this.options.runBrowserBehaviors = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -109,11 +118,13 @@ export class TwitterCapture {
|
||||||
await this.adjustUIForCapture();
|
await this.adjustUIForCapture();
|
||||||
|
|
||||||
// Run browser behaviors
|
// Run browser behaviors
|
||||||
if (this.urlType !== "profile") { // Skipped on profile pages
|
if (this.options.runBrowserBehaviors === true) {
|
||||||
await this.runBrowserBehaviors();
|
await this.runBrowserBehaviors();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
new Promise(resolve => setTimeout(resolve, this.options.networkidleTimeout));
|
await new Promise((resolve) =>
|
||||||
|
setTimeout(resolve, this.options.networkidleTimeout + this.options.renderTimeout)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for network idle
|
// Wait for network idle
|
||||||
|
@ -149,15 +160,16 @@ export class TwitterCapture {
|
||||||
//console.log(err);
|
//console.log(err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try to crop remaining white space
|
||||||
|
await this.cropMarginsOnPDF(editablePDF);
|
||||||
|
|
||||||
// Try to capture video, if any, and add it as attachment
|
// Try to capture video, if any, and add it as attachment
|
||||||
await this.captureAndAddVideoToPDF(editablePDF);
|
await this.captureAndAddVideoToPDF(editablePDF);
|
||||||
|
|
||||||
// Freeze edited PDF in memory
|
// Freeze edited PDF in memory
|
||||||
editedPDF = await editablePDF.save();
|
editedPDF = await editablePDF.save();
|
||||||
|
|
||||||
// Crop and compress
|
fs.writeFileSync("unsigned.pdf", editedPDF)
|
||||||
editedPDF = await this.cropMarginsOnPDF(editedPDF);
|
|
||||||
editedPDF = await this.compressPDF(editedPDF);
|
|
||||||
|
|
||||||
// Sign
|
// Sign
|
||||||
editedPDF = await this.signPDF(editedPDF);
|
editedPDF = await this.signPDF(editedPDF);
|
||||||
|
@ -174,7 +186,7 @@ export class TwitterCapture {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets up the browser used for capture as well as network interception for images capture.
|
* Sets up the browser used for capture as well as network interception for images capture.
|
||||||
* Populates `this.playwright` and `this.playwrightIsReady`;
|
* Populates `this.playwright`.
|
||||||
* @returns {Promise<void>}
|
* @returns {Promise<void>}
|
||||||
*/
|
*/
|
||||||
setup = async() => {
|
setup = async() => {
|
||||||
|
@ -188,6 +200,8 @@ export class TwitterCapture {
|
||||||
this.playwright.context = await this.playwright.browser.newContext({ userAgent });
|
this.playwright.context = await this.playwright.browser.newContext({ userAgent });
|
||||||
this.playwright.page = await this.playwright.context.newPage();
|
this.playwright.page = await this.playwright.context.newPage();
|
||||||
|
|
||||||
|
this.playwright.viewport = viewport;
|
||||||
|
|
||||||
this.playwright.page.setViewportSize(viewport);
|
this.playwright.page.setViewportSize(viewport);
|
||||||
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 500)); // [Debug]
|
await new Promise(resolve => setTimeout(resolve, 500)); // [Debug]
|
||||||
|
@ -495,65 +509,26 @@ export class TwitterCapture {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param {Buffer} editedPDF - PDF Bytes
|
* Tries to remove some of the white space at the bottom of the PDF.
|
||||||
* @returns {Buffer} - PDF Bytes
|
* [!] TODO: This is a "let's ship it" hack. We will need to find a better solution.
|
||||||
|
* @param {PDFDocument} editablePDF
|
||||||
*/
|
*/
|
||||||
cropMarginsOnPDF = async(editedPDF) => {
|
cropMarginsOnPDF = async(editablePDF) => {
|
||||||
// Save PDF to disk
|
const page = editablePDF.getPage(0);
|
||||||
const id = uuidv4();
|
const originalHeight = page.getHeight();
|
||||||
const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
|
|
||||||
const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;
|
|
||||||
fs.writeFileSync(filepathIn, editedPDF);
|
|
||||||
|
|
||||||
// Apply cropping
|
// Only crop if content > viewport
|
||||||
const run = spawnSync(
|
if (this.playwright.viewport.height > originalHeight) {
|
||||||
"pdf-crop-margins",
|
return;
|
||||||
["-p", "0", "-a", "-20", "-o", filepathOut, filepathIn],
|
|
||||||
{ encoding: "utf-8" }
|
|
||||||
);
|
|
||||||
|
|
||||||
if (run.status !== 0) {
|
|
||||||
throw new Error(run.stderr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load cropped file from disk and return
|
|
||||||
editedPDF = fs.readFileSync(filepathOut);
|
|
||||||
fs.unlink(filepathIn, () => {});
|
|
||||||
fs.unlink(filepathOut, () => {});
|
|
||||||
|
|
||||||
return editedPDF;
|
const reductionFactor = this.options.runBrowserBehaviors ? 44 : 88;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
const newHeight = Math.floor(originalHeight - (originalHeight / 100 * reductionFactor));
|
||||||
* @param {Buffer} editedPDF - PDF Bytes
|
const yShift = originalHeight - newHeight;
|
||||||
* @returns {Buffer} - PDF Bytes
|
|
||||||
*/
|
|
||||||
compressPDF = async(editedPDF) => {
|
|
||||||
// Save PDF to disk
|
|
||||||
const id = uuidv4();
|
|
||||||
const filepathIn = `${this.options.tmpFolderPath}${id}-in.pdf`;
|
|
||||||
const filepathOut = `${this.options.tmpFolderPath}${id}-out.pdf`;
|
|
||||||
fs.writeFileSync(filepathIn, editedPDF);
|
|
||||||
|
|
||||||
const run = spawnSync("gs", [
|
page.setSize(page.getWidth(), newHeight);
|
||||||
"-sDEVICE=pdfwrite",
|
page.translateContent(0, -yShift);
|
||||||
"-dNOPAUSE",
|
|
||||||
"-dBATCH",
|
|
||||||
"-dJPEGQ=90",
|
|
||||||
"-r150",
|
|
||||||
`-sOutputFile=${filepathOut}`,
|
|
||||||
`${filepathIn}`,
|
|
||||||
]);
|
|
||||||
|
|
||||||
if (run.status !== 0) {
|
|
||||||
throw new Error(run.stderr);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Load compressed file from disk and return
|
|
||||||
editedPDF = fs.readFileSync(filepathOut);
|
|
||||||
fs.unlink(filepathIn, () => {});
|
|
||||||
fs.unlink(filepathOut, () => {});
|
|
||||||
return editedPDF;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Ładowanie…
Reference in New Issue