From e3a0003a47e52b32e473573cb475be565a92e7d6 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 27 Jul 2023 21:36:25 +0100
Subject: [PATCH] adding WACZ screenshots

---
 src/auto_archiver/enrichers/wacz_enricher.py | 32 +++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py
index 6962e09..e401de6 100644
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -44,7 +44,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
             return True
 
         url = to_enrich.get_url()
-        logger.warning(f"ENRICHING WACZ for {url=}")
 
         collection = str(uuid.uuid4())[0:8]
         browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
@@ -58,6 +57,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
                 "--scopeType", "page",
                 "--generateWACZ",
                 "--text",
+                "--screenshot", "fullPage",
                 "--collection", collection,
                 "--id", collection,
                 "--saveState", "never",
@@ -80,6 +80,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
                 "--scopeType", "page",
                 "--generateWACZ",
                 "--text",
+                "--screenshot", "fullPage",
                 "--collection", collection,
                 "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
                 "--behaviorTimeout", str(self.timeout),
@@ -136,14 +137,25 @@ class WaczArchiverEnricher(Enricher, Archiver):
 
         # get media out of .warc
         counter = 0
+        seen_urls = set()
         with open(warc_filename, 'rb') as warc_stream:
             for record in ArchiveIterator(warc_stream):
                 # only include fetched resources
+                if record.rec_type == "resource":  # screenshots
+                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
+                    with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
+                    m = Media(filename=fn)
+                    to_enrich.add_media(m, "browsertrix-screenshot")
+                    counter += 1
+
                 if record.rec_type != 'response': continue
                 record_url = record.rec_headers.get_header('WARC-Target-URI')
                 if not UrlUtil.is_relevant_url(record_url):
                     logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
                     continue
+                if record_url in seen_urls:
+                    logger.debug(f"Skipping already seen URL {record_url}.")
+                    continue
 
                 # filter by media mimetypes
                 content_type = record.http_headers.get("Content-Type")
@@ -152,11 +164,23 @@ class WaczArchiverEnricher(Enricher, Archiver):
                 # create local file and add media
                 ext = mimetypes.guess_extension(content_type)
-                fn = os.path.join(tmp_dir, f"warc-file-{counter}{ext}")
+                warc_fn = f"warc-file-{counter}{ext}"
+                fn = os.path.join(tmp_dir, warc_fn)
+
+                record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
                 with open(fn, "wb") as outf:
                     outf.write(record.raw_stream.read())
+
                 m = Media(filename=fn)
                 m.set("src", record_url)
-                # TODO URLUTIL to ignore known-recurring media like favicons, profile pictures, etc.
-                to_enrich.add_media(m, f"browsertrix-media-{counter}")
+                # if a link with better quality exists, try to download that
+                if record_url_best_qual != record_url:
+                    try:
+                        m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
+                        m.set("src", record_url_best_qual)
+                        m.set("src_alternative", record_url)
+                    except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
+
+                to_enrich.add_media(m, warc_fn)
                 counter += 1
+                seen_urls.add(record_url)
         logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
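
The screenshots are written by browsertrix-crawler into the crawl's WARC as
"resource" records, which is what the new rec_type == "resource" branch picks
up. For quick verification outside the enricher, they can be pulled out with
a few lines of warcio (already a dependency here). A minimal sketch, assuming
a crawl was run with --screenshot fullPage; the input path and output names
are illustrative, not part of this patch:

    import os
    from warcio.archiveiterator import ArchiveIterator

    def extract_screenshots(warc_path: str, out_dir: str) -> int:
        # save every "resource" record as a PNG, return how many were saved
        os.makedirs(out_dir, exist_ok=True)
        saved = 0
        with open(warc_path, "rb") as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != "resource":
                    continue
                # screenshot records typically carry a urn:...:<page-url> target
                target = record.rec_headers.get_header("WARC-Target-URI") or ""
                out = os.path.join(out_dir, f"screenshot-{saved}.png")
                with open(out, "wb") as outf:
                    outf.write(record.raw_stream.read())
                print(f"saved {out} for {target}")
                saved += 1
        return saved

    if __name__ == "__main__":
        extract_screenshots("collections/example/archive/screenshots.warc.gz", "screenshots")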
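
The best-quality swap goes through UrlUtil.twitter_best_quality_url. As an
illustrative sketch of the idea, rather than the helper's exact
implementation: pbs.twimg.com serves sized variants of the same media via the
name query parameter, and name=orig requests the original asset:

    import re

    def best_quality_twitter_url(url: str) -> str:
        # rewrite a Twitter CDN media URL to ask for the original size
        if "twimg.com" not in url:
            return url
        return re.sub(r"name=\w+", "name=orig", url)

    # e.g. ...?format=jpg&name=small -> ...?format=jpg&name=orig
    assert best_quality_twitter_url(
        "https://pbs.twimg.com/media/ABC123?format=jpg&name=small"
    ) == "https://pbs.twimg.com/media/ABC123?format=jpg&name=orig"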