Add more flexibility to how antibot dropins find media on a page

pull/319/head
msramalho 2025-06-11 16:51:22 +01:00
rodzic d60d02c16e
commit b60469767a
Nie znaleziono w bazie danych klucza dla tego podpisu
3 zmienionych plików z 26 dodań i 13 usunięć

Wyświetl plik

@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
self._enrich_download_media(
sb,
to_enrich,
css_selector=dropin.images_selectors(),
js_css_selector=dropin.js_for_image_css_selectors(),
max_media=self.max_download_images - downloaded_images,
)
self._enrich_download_media(
sb,
to_enrich,
css_selector=dropin.video_selectors(),
js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos,
)
logger.info(f"ANTIBOT completed for {url_sample}")
@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
@logger.catch
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
"""
Downloads media from the page and adds them to the Metadata object.
This method is called by the enrich method.
@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
url = to_enrich.get_url()
all_urls = set()
sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href)
.filter(Boolean);
""")
sources = sb.execute_script(js_css_selector)
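# The script is supplied by the caller, built via the dropin's js_for_image_css_selectors() / js_for_video_css_selectors().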
for src in sources:
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")

Wyświetl plik

@ -53,6 +53,26 @@ class Dropin:
"""
return "video, source"
def js_for_image_css_selectors(self) -> str:
    """
    Build the JS snippet used to collect image URLs from the current page.

    The snippet queries the document with the CSS selector returned by `images_selectors()` and returns an array holding, for each matching element, its `src` (or its `href` when `src` is empty); elements with neither are dropped.
    You can overwrite this instead of `images_selectors` for more control over scraped images.

    :return: JavaScript source code, as a string, that returns an array of the collected values.
    """
    import json  # local import: only needed to quote the selector as a JS string literal

    # json.dumps emits a double-quoted, fully escaped string literal that is also valid JS,
    # so selectors containing double quotes (e.g. `img[alt="x"]`) or backslashes cannot break the script.
    selector_literal = json.dumps(self.images_selectors())
    return f"""
        return Array.from(document.querySelectorAll({selector_literal})).map(el => el.src || el.href).filter(Boolean);
    """
def js_for_video_css_selectors(self) -> str:
    """
    Build the JS snippet used to collect video URLs from the current page.

    The snippet queries the document with the CSS selector returned by `video_selectors()` and returns an array holding, for each matching element, its `src` (or its `href` when `src` is empty); elements with neither are dropped.
    You can overwrite this instead of `video_selectors` for more control over scraped videos.

    :return: JavaScript source code, as a string, that returns an array of the collected values.
    """
    import json  # local import: only needed to quote the selector as a JS string literal

    # json.dumps emits a double-quoted, fully escaped string literal that is also valid JS,
    # so selectors containing double quotes (e.g. `video[type="video/mp4"]`) or backslashes cannot break the script.
    selector_literal = json.dumps(self.video_selectors())
    return f"""
        return Array.from(document.querySelectorAll({selector_literal})).map(el => el.src || el.href).filter(Boolean);
    """
def open_page(self, url) -> bool:
"""
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@ -66,7 +86,7 @@ class Dropin:
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
:return: A tuple (number of Images added, number of Videos added).
"""
raise NotImplementedError("This method should be implemented in the subclass")
return 0, 0
def _get_username_password(self, site) -> tuple[str, str]:
"""

Wyświetl plik

@ -1,4 +1,3 @@
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
def open_page(self, url) -> bool:
    """Report the page as open; this implementation takes no action on `url` and always returns True."""
    return True
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
    """Add no extra media to `to_enrich`; always returns the pair (0, 0)."""
    images_added, videos_added = 0, 0
    return images_added, videos_added