browsertrix docker updates

2024-04-15 19:01:55 +01:00 · 2024-04-15 19:01:55 +01:00 · 9c7824de57
commit 9c7824de57
--- a/7
+++ b/7
@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:latest
+FROM webrecorder/browsertrix-crawler:1.0.4

 ENV RUNNING_IN_DOCKER=1

@ -19,9 +19,8 @@ RUN pip install --upgrade pip && \

 COPY Pipfile* ./
 # install from pipenv, with browsertrix-only requirements
-RUN pipenv install && \
-	pipenv install pywb uwsgi
-	
+RUN pipenv install
+
 # doing this at the end helps during development, builds are quick
 COPY ./src/ . 

--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@ -75,14 +75,16 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "--url", url,
            "--scopeType", "page",
            "--generateWACZ",
-            "--text",
+            "--text", "to-pages",
            "--screenshot", "fullPage",
            "--collection", collection,
            "--id", collection,
            "--saveState", "never",
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", str(self.timeout),
-            "--timeout", str(self.timeout)]
+            "--timeout", str(self.timeout),
+            "--blockAds" # TODO: test
+        ]
        
        if self.docker_in_docker:
            cmd.extend(["--cwd", self.cwd_dind])
@ -110,9 +112,9 @@ class WaczArchiverEnricher(Enricher, Archiver):

        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
+            my_env = os.environ.copy()
            if self.socks_proxy_host and self.socks_proxy_port:
                logger.debug("Using SOCKS proxy for browsertrix-crawler")
-                my_env = os.environ.copy()
                my_env["SOCKS_HOST"] = self.socks_proxy_host
                my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
            subprocess.run(cmd, check=True, env=my_env)
@ -161,7 +163,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
        """
        Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
        """
-        logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")
+        logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")

        # unzipping the .wacz
        tmp_dir = ArchivingContext.get_tmp_dir()
@ -182,10 +184,11 @@ class WaczArchiverEnricher(Enricher, Archiver):
        # get media out of .warc
        counter = 0
        seen_urls = set()
+        import json
        with open(warc_filename, 'rb') as warc_stream:
            for record in ArchiveIterator(warc_stream):
                # only include fetched resources
-                if record.rec_type == "resource" and self.extract_screenshot:  # screenshots
+                if record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot:  # screenshots
                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
                    with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
                    m = Media(filename=fn)
@ -231,4 +234,4 @@ class WaczArchiverEnricher(Enricher, Archiver):
                to_enrich.add_media(m, warc_fn)
                counter += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
+        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")