kopia lustrzana https://github.com/bellingcat/auto-archiver
adding WACZ screenshots
rodzic
59551b3b20
commit
e3a0003a47
|
@ -44,7 +44,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.warning(f"ENRICHING WACZ for {url=}")
|
|
||||||
|
|
||||||
collection = str(uuid.uuid4())[0:8]
|
collection = str(uuid.uuid4())[0:8]
|
||||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||||
|
@ -58,6 +57,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
|
"--screenshot", "fullPage",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--id", collection,
|
"--id", collection,
|
||||||
"--saveState", "never",
|
"--saveState", "never",
|
||||||
|
@ -80,6 +80,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
|
"--screenshot", "fullPage",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--behaviorTimeout", str(self.timeout),
|
||||||
|
@ -136,14 +137,25 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
|
|
||||||
# get media out of .warc
|
# get media out of .warc
|
||||||
counter = 0
|
counter = 0
|
||||||
|
seen_urls = set()
|
||||||
with open(warc_filename, 'rb') as warc_stream:
|
with open(warc_filename, 'rb') as warc_stream:
|
||||||
for record in ArchiveIterator(warc_stream):
|
for record in ArchiveIterator(warc_stream):
|
||||||
# only include fetched resources
|
# only include fetched resources
|
||||||
|
if record.rec_type == "resource": # screenshots
|
||||||
|
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
||||||
|
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
||||||
|
m = Media(filename=fn)
|
||||||
|
to_enrich.add_media(m, "browsertrix-screenshot")
|
||||||
|
counter += 1
|
||||||
|
|
||||||
if record.rec_type != 'response': continue
|
if record.rec_type != 'response': continue
|
||||||
record_url = record.rec_headers.get_header('WARC-Target-URI')
|
record_url = record.rec_headers.get_header('WARC-Target-URI')
|
||||||
if not UrlUtil.is_relevant_url(record_url):
|
if not UrlUtil.is_relevant_url(record_url):
|
||||||
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
|
||||||
continue
|
continue
|
||||||
|
if record_url in seen_urls:
|
||||||
|
logger.debug(f"Skipping already seen URL {record_url}.")
|
||||||
|
continue
|
||||||
|
|
||||||
# filter by media mimetypes
|
# filter by media mimetypes
|
||||||
content_type = record.http_headers.get("Content-Type")
|
content_type = record.http_headers.get("Content-Type")
|
||||||
|
@ -152,11 +164,23 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||||
|
|
||||||
# create local file and add media
|
# create local file and add media
|
||||||
ext = mimetypes.guess_extension(content_type)
|
ext = mimetypes.guess_extension(content_type)
|
||||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}{ext}")
|
warc_fn = f"warc-file-{counter}{ext}"
|
||||||
|
fn = os.path.join(tmp_dir, warc_fn)
|
||||||
|
|
||||||
|
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
||||||
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
|
||||||
|
|
||||||
m = Media(filename=fn)
|
m = Media(filename=fn)
|
||||||
m.set("src", record_url)
|
m.set("src", record_url)
|
||||||
# TODO URLUTIL to ignore known-recurring media like favicons, profile pictures, etc.
|
# if a link with better quality exists, try to download that
|
||||||
to_enrich.add_media(m, f"browsertrix-media-{counter}")
|
if record_url_best_qual != record_url:
|
||||||
|
try:
|
||||||
|
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
|
||||||
|
m.set("src", record_url_best_qual)
|
||||||
|
m.set("src_alternative", record_url)
|
||||||
|
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
|
||||||
|
|
||||||
|
to_enrich.add_media(m, warc_fn)
|
||||||
counter += 1
|
counter += 1
|
||||||
|
seen_urls.add(record_url)
|
||||||
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
|
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
|
||||||
|
|
Ładowanie…
Reference in New Issue