Add more flexibility to the antibot dropins' media-finding process

2025-06-11 16:51:22 +01:00 · 2025-06-11 16:51:22 +01:00 · b60469767a
commit b60469767a
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
                self._enrich_download_media(
                    sb,
                    to_enrich,
-                    css_selector=dropin.images_selectors(),
+                    js_css_selector=dropin.js_for_image_css_selectors(),
                    max_media=self.max_download_images - downloaded_images,
                )
                self._enrich_download_media(
                    sb,
                    to_enrich,
-                    css_selector=dropin.video_selectors(),
+                    js_css_selector=dropin.js_for_video_css_selectors(),
                    max_media=self.max_download_videos - downloaded_videos,
                )
                logger.info(f"ANTIBOT completed for {url_sample}")
@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
        to_enrich.add_media(Media(filename=pdf_filename), id="pdf")

    @logger.catch
-    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
+    def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
        """
        Downloads media from the page and adds them to the Metadata object.
        This method is called by the enrich method.
@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
        url = to_enrich.get_url()
        all_urls = set()

-        sources = sb.execute_script(f"""
-            return Array.from(document.querySelectorAll("{css_selector}"))
-                    .map(el => el.src || el.href)
-                    .filter(Boolean);
-        """)
+        sources = sb.execute_script(js_css_selector)
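+        # js_css_selector is the dropin-provided JS (see js_for_image_css_selectors / js_for_video_css_selectors); it must return an array of media URLs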
        for src in sources:
            if len(all_urls) >= max_media:
                logger.debug(f"Reached max download limit of {max_media} images/videos.")
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@ -53,6 +53,26 @@ class Dropin:
        """
        return "video, source"

+    def js_for_image_css_selectors(self) -> str:
+        """
+        A configurable JS script that embeds the CSS selector from the dropin's `images_selectors()` and returns an array of image URLs (each matched element's `src` or `href`).
+
+        You can override this instead of `images_selectors` for more control over scraped images.
+        """
+        return f"""
+            return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
+    def js_for_video_css_selectors(self) -> str:
+        """
+        A configurable JS script that embeds the CSS selector from the dropin's `video_selectors()` and returns an array of video URLs (each matched element's `src` or `href`).
+
+        You can override this instead of `video_selectors` for more control over scraped videos.
+        """
+        return f"""
+            return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        """
+
    def open_page(self, url) -> bool:
        """
        Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@ -66,7 +86,7 @@ class Dropin:
        Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
        :return: A tuple (number of Images added, number of Videos added).
        """
-        raise NotImplementedError("This method should be implemented in the subclass")
+        return 0, 0

    def _get_username_password(self, site) -> tuple[str, str]:
        """
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py
@ -1,4 +1,3 @@
-from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


@ -13,6 +12,3 @@ class DefaultDropin(Dropin):

    def open_page(self, url) -> bool:
        return True
-
-    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        return 0, 0