kopia lustrzana https://github.com/bellingcat/auto-archiver
new auth wall check logic and escaped CSS selector in selenium
rodzic
7234eda85f
commit
37c6d97275
|
@ -99,7 +99,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
dropin = self._get_suitable_dropin(url, sb)
|
||||
dropin.open_page(url)
|
||||
|
||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
||||
return False
|
||||
|
||||
|
@ -277,8 +277,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
return
|
||||
url = to_enrich.get_url()
|
||||
all_urls = set()
|
||||
logger.debug(f"Extracting media for {js_css_selector=}")
|
||||
|
||||
try:
|
||||
sources = sb.execute_script(js_css_selector)
|
||||
except selenium.common.exceptions.JavascriptException as e:
|
||||
logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
|
||||
return
|
||||
|
||||
sources = sb.execute_script(js_css_selector)
|
||||
# js_for_css_selectors
|
||||
for src in sources:
|
||||
if len(all_urls) >= max_media:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import json
|
||||
import os
|
||||
import traceback
|
||||
from typing import Mapping
|
||||
|
@ -74,8 +75,11 @@ class Dropin:
|
|||
|
||||
You can overwrite this instead of `images_selector` for more control over scraped images.
|
||||
"""
|
||||
if not self.images_selectors():
|
||||
return "return [];"
|
||||
safe_selector = json.dumps(self.images_selectors())
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def js_for_video_css_selectors(self) -> str:
|
||||
|
@ -84,8 +88,11 @@ class Dropin:
|
|||
|
||||
You can overwrite this instead of `video_selector` for more control over scraped videos.
|
||||
"""
|
||||
if not self.video_selectors():
|
||||
return "return [];"
|
||||
safe_selector = json.dumps(self.video_selectors())
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
|
@ -103,6 +110,12 @@ class Dropin:
|
|||
"""
|
||||
return 0, 0
|
||||
|
||||
def hit_auth_wall(self) -> bool:
|
||||
"""
|
||||
Custom check for whether the current page is behind an authentication wall. If True is returned, the decision is deferred to the default global auth wall detector. If False is returned, no auth wall is detected and the page is considered open.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _get_username_password(self, site) -> tuple[str, str]:
|
||||
"""
|
||||
Get the username and password for the site from the extractor's auth data.
|
||||
|
|
Ładowanie…
Reference in New Issue