kopia lustrzana https://github.com/bellingcat/auto-archiver
more flexibility to antibot dropins media finding process
rodzic
d60d02c16e
commit
b60469767a
|
@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
self._enrich_download_media(
|
||||
sb,
|
||||
to_enrich,
|
||||
css_selector=dropin.images_selectors(),
|
||||
js_css_selector=dropin.js_for_image_css_selectors(),
|
||||
max_media=self.max_download_images - downloaded_images,
|
||||
)
|
||||
self._enrich_download_media(
|
||||
sb,
|
||||
to_enrich,
|
||||
css_selector=dropin.video_selectors(),
|
||||
js_css_selector=dropin.js_for_video_css_selectors(),
|
||||
max_media=self.max_download_videos - downloaded_videos,
|
||||
)
|
||||
logger.info(f"ANTIBOT completed for {url_sample}")
|
||||
|
@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
|
||||
|
||||
@logger.catch
|
||||
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
|
||||
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
|
||||
"""
|
||||
Downloads media from the page and adds them to the Metadata object.
|
||||
This method is called by the enrich method.
|
||||
|
@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||
url = to_enrich.get_url()
|
||||
all_urls = set()
|
||||
|
||||
sources = sb.execute_script(f"""
|
||||
return Array.from(document.querySelectorAll("{css_selector}"))
|
||||
.map(el => el.src || el.href)
|
||||
.filter(Boolean);
|
||||
""")
|
||||
sources = sb.execute_script(js_css_selector)
|
||||
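# js_css_selector is a complete JS snippet supplied by the dropin (js_for_image_css_selectors / js_for_video_css_selectors); it is expected to return a list of media URLs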
|
||||
for src in sources:
|
||||
if len(all_urls) >= max_media:
|
||||
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
||||
|
|
|
@ -53,6 +53,26 @@ class Dropin:
|
|||
"""
|
||||
return "video, source"
|
||||
|
||||
def js_for_image_css_selectors(self) -> str:
|
||||
"""
|
||||
A configurable JS script that embeds the CSS selector from `images_selectors()` and returns an array of image URLs (the `src` or `href` of each matched element, with empty values dropped).
|
||||
|
||||
You can override this instead of `images_selectors` for more control over scraped images.
|
||||
"""
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def js_for_video_css_selectors(self) -> str:
|
||||
"""
|
||||
A configurable JS script that embeds the CSS selector from `video_selectors()` and returns an array of video URLs (the `src` or `href` of each matched element, with empty values dropped).
|
||||
|
||||
You can override this instead of `video_selectors` for more control over scraped videos.
|
||||
"""
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
"""
|
||||
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
|
||||
|
@ -66,7 +86,7 @@ class Dropin:
|
|||
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
|
||||
:return: A tuple (number of Images added, number of Videos added).
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
return 0, 0
|
||||
|
||||
def _get_username_password(self, site) -> tuple[str, str]:
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
|
||||
|
@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
|
|||
|
||||
def open_page(self, url) -> bool:
|
||||
return True
|
||||
|
||||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
||||
return 0, 0
|
||||
|
|
Ładowanie…
Reference in New Issue