new auth wall check logic and escaped CSS selector in selenium

2025-07-05 18:30:31 +01:00 · 2025-07-05 18:30:31 +01:00 · 37c6d97275
commit 37c6d97275
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -99,7 +99,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
                dropin = self._get_suitable_dropin(url, sb)
                dropin.open_page(url)

-                if self.detect_auth_wall and self._hit_auth_wall(sb):
+                if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
                    logger.warning("Skipping since auth wall or CAPTCHA was detected")
                    return False

@@ -277,8 +277,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
            return
        url = to_enrich.get_url()
        all_urls = set()
+        logger.debug(f"Extracting media for {js_css_selector=}")
+
+        try:
+            sources = sb.execute_script(js_css_selector)
+        except selenium.common.exceptions.JavascriptException as e:
+            logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
+            return

-        sources = sb.execute_script(js_css_selector)
        # js_for_css_selectors
        for src in sources:
            if len(all_urls) >= max_media:
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,3 +1,4 @@
+import json
 import os
 import traceback
 from typing import Mapping
@@ -74,8 +75,11 @@ class Dropin:

        You can overwrite this instead of `images_selector` for more control over scraped images.
        """
+        if not self.images_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.images_selectors())
        return f"""
-            return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+            return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
        """

    def js_for_video_css_selectors(self) -> str:
@@ -84,8 +88,11 @@ class Dropin:

        You can overwrite this instead of `video_selector` for more control over scraped videos.
        """
+        if not self.video_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.video_selectors())
        return f"""
-            return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+            return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
        """

    def open_page(self, url) -> bool:
@@ -103,6 +110,12 @@ class Dropin:
        """
        return 0, 0

+    def hit_auth_wall(self) -> bool:
+        """
+        Custom check for whether the current page is behind an authentication wall. If True is returned, the default global auth wall detector is run to make the final decision; if False, no auth wall is detected and the page is considered open.
+        """
+        return True
+
    def _get_username_password(self, site) -> tuple[str, str]:
        """
        Get the username and password for the site from the extractor's auth data.