more flexibility to antibot dropins media finding process

pull/319/head
msramalho 2025-06-11 16:51:22 +01:00
rodzic d60d02c16e
commit b60469767a
Nie znaleziono w bazie danych klucza dla tego podpisu
3 zmienionych plików z 26 dodań i 13 usunięć

Wyświetl plik

@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
self._enrich_download_media( self._enrich_download_media(
sb, sb,
to_enrich, to_enrich,
css_selector=dropin.images_selectors(), js_css_selector=dropin.js_for_image_css_selectors(),
max_media=self.max_download_images - downloaded_images, max_media=self.max_download_images - downloaded_images,
) )
self._enrich_download_media( self._enrich_download_media(
sb, sb,
to_enrich, to_enrich,
css_selector=dropin.video_selectors(), js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos, max_media=self.max_download_videos - downloaded_videos,
) )
logger.info(f"ANTIBOT completed for {url_sample}") logger.info(f"ANTIBOT completed for {url_sample}")
@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
to_enrich.add_media(Media(filename=pdf_filename), id="pdf") to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
@logger.catch @logger.catch
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int): def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
""" """
Downloads media from the page and adds them to the Metadata object. Downloads media from the page and adds them to the Metadata object.
This method is called by the enrich method. This method is called by the enrich method.
@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
url = to_enrich.get_url() url = to_enrich.get_url()
all_urls = set() all_urls = set()
sources = sb.execute_script(f""" sources = sb.execute_script(js_css_selector)
return Array.from(document.querySelectorAll("{css_selector}")) # js_for_css_selectors
.map(el => el.src || el.href)
.filter(Boolean);
""")
for src in sources: for src in sources:
if len(all_urls) >= max_media: if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.") logger.debug(f"Reached max download limit of {max_media} images/videos.")

Wyświetl plik

@ -53,6 +53,26 @@ class Dropin:
""" """
return "video, source" return "video, source"
def js_for_image_css_selectors(self) -> str:
    """
    Build the JS snippet used to collect image URLs from the currently open page.

    The default script selects every element matching `images_selectors()` and
    returns an array with each element's `src` (or, when that is empty, `href`),
    dropping empty values. Override this instead of `images_selectors` for more
    control over scraped images; an override should likewise be a script body
    that `return`s an array of URL strings.

    :return: JavaScript source code as a string.
    """
    import json  # local import: only needed to build the JS string literal

    # json.dumps produces a fully escaped, double-quoted literal that is also a
    # valid JS string, so selectors containing quotes or backslashes
    # (e.g. img[src^="https"]) can no longer break the generated script.
    selector_literal = json.dumps(self.images_selectors())
    return (
        "\n"
        f"return Array.from(document.querySelectorAll({selector_literal}))"
        ".map(el => el.src || el.href).filter(Boolean);\n"
    )
def js_for_video_css_selectors(self) -> str:
    """
    Build the JS snippet used to collect video URLs from the currently open page.

    The default script selects every element matching `video_selectors()` and
    returns an array with each element's `src` (or, when that is empty, `href`),
    dropping empty values. Override this instead of `video_selectors` for more
    control over scraped videos; an override should likewise be a script body
    that `return`s an array of URL strings.

    :return: JavaScript source code as a string.
    """
    import json  # local import: only needed to build the JS string literal

    # json.dumps produces a fully escaped, double-quoted literal that is also a
    # valid JS string, so selectors containing quotes or backslashes
    # (e.g. source[type="video/mp4"]) can no longer break the generated script.
    selector_literal = json.dumps(self.video_selectors())
    return (
        "\n"
        f"return Array.from(document.querySelectorAll({selector_literal}))"
        ".map(el => el.src || el.href).filter(Boolean);\n"
    )
def open_page(self, url) -> bool: def open_page(self, url) -> bool:
""" """
Make sure the page is opened, even if it requires authentication, captcha solving, etc. Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@ -66,7 +86,7 @@ class Dropin:
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object. Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
:return: A tuple (number of Images added, number of Videos added). :return: A tuple (number of Images added, number of Videos added).
""" """
raise NotImplementedError("This method should be implemented in the subclass") return 0, 0
def _get_username_password(self, site) -> tuple[str, str]: def _get_username_password(self, site) -> tuple[str, str]:
""" """

Wyświetl plik

@ -1,4 +1,3 @@
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
def open_page(self, url) -> bool: def open_page(self, url) -> bool:
return True return True
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
    """
    No-op media hook: adds nothing to `to_enrich` and reports zero for both counts.

    :return: the pair (0, 0) -- presumably (images added, videos added); confirm against the base Dropin contract.
    """
    images_added = videos_added = 0
    return images_added, videos_added