diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
index 5e61bad..d1a4ee5 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -99,7 +99,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
             dropin = self._get_suitable_dropin(url, sb)
             dropin.open_page(url)
 
-            if self.detect_auth_wall and self._hit_auth_wall(sb):
+            if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
                 logger.warning("Skipping since auth wall or CAPTCHA was detected")
                 return False
 
@@ -277,8 +277,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
             return
 
         url = to_enrich.get_url()
         all_urls = set()
+        logger.debug(f"Extracting media for {js_css_selector=}")
+
+        try:
+            sources = sb.execute_script(js_css_selector)
+        except selenium.common.exceptions.JavascriptException as e:
+            logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
+            return
 
-        sources = sb.execute_script(js_css_selector)  # js_for_css_selectors
         for src in sources:
             if len(all_urls) >= max_media:
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 47c958a..b2539b1 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,3 +1,4 @@
+import json
 import os
 import traceback
 from typing import Mapping
@@ -74,8 +75,11 @@ class Dropin:
 
         You can overwrite this instead of `images_selector` for more control over scraped images.
         """
+        if not self.images_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.images_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def js_for_video_css_selectors(self) -> str:
@@ -84,8 +88,11 @@ class Dropin:
 
         You can overwrite this instead of `video_selector` for more control over scraped videos.
        """
+        if not self.video_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.video_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def open_page(self, url) -> bool:
@@ -103,6 +110,12 @@ class Dropin:
         """
         return 0, 0
 
+    def hit_auth_wall(self) -> bool:
+        """
+        Custom check for whether the current page is behind an authentication wall. If True is returned, the default global auth wall detector is applied as a fallback; if False is returned, no auth wall is assumed and the page is considered open.
+        """
+        return True
+
     def _get_username_password(self, site) -> tuple[str, str]:
         """
         Get the username and password for the site from the extractor's auth data.
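
Note: the sketch below is not part of the patch; it is a minimal illustration of how a site-specific dropin might use the hooks touched above. "ExampleDropin" and its CSS selectors are hypothetical, and the sketch assumes the import path shown in the diff plus the base-class methods images_selectors(), video_selectors(), and the new hit_auth_wall().

    from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin


    class ExampleDropin(Dropin):
        """Hypothetical dropin exercising the selector and auth-wall hooks."""

        def images_selectors(self) -> str:
            # The base class now passes this string through json.dumps(), so the
            # double quotes inside the selector cannot break the generated JavaScript.
            return 'figure img, a[href$=".jpg"]'

        def video_selectors(self) -> str:
            # An empty selector makes js_for_video_css_selectors() emit "return [];",
            # so no video-scraping JavaScript is executed for this site.
            return ""

        def hit_auth_wall(self) -> bool:
            # Returning False skips the enricher's global _hit_auth_wall(sb) check;
            # returning True (the base-class default) keeps it as a fallback.
            return False

With the json.dumps change in the base class, the quoted selector above is embedded as a proper JavaScript string literal rather than interpolated into double quotes by hand.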