kopia lustrzana https://github.com/bellingcat/auto-archiver
more flexibility to antibot dropins media finding process
rodzic
d60d02c16e
commit
b60469767a
|
@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||||
self._enrich_download_media(
|
self._enrich_download_media(
|
||||||
sb,
|
sb,
|
||||||
to_enrich,
|
to_enrich,
|
||||||
css_selector=dropin.images_selectors(),
|
js_css_selector=dropin.js_for_image_css_selectors(),
|
||||||
max_media=self.max_download_images - downloaded_images,
|
max_media=self.max_download_images - downloaded_images,
|
||||||
)
|
)
|
||||||
self._enrich_download_media(
|
self._enrich_download_media(
|
||||||
sb,
|
sb,
|
||||||
to_enrich,
|
to_enrich,
|
||||||
css_selector=dropin.video_selectors(),
|
js_css_selector=dropin.js_for_video_css_selectors(),
|
||||||
max_media=self.max_download_videos - downloaded_videos,
|
max_media=self.max_download_videos - downloaded_videos,
|
||||||
)
|
)
|
||||||
logger.info(f"ANTIBOT completed for {url_sample}")
|
logger.info(f"ANTIBOT completed for {url_sample}")
|
||||||
|
@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||||
to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
|
to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
|
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
|
||||||
"""
|
"""
|
||||||
Downloads media from the page and adds them to the Metadata object.
|
Downloads media from the page and adds them to the Metadata object.
|
||||||
This method is called by the enrich method.
|
This method is called by the enrich method.
|
||||||
|
@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
all_urls = set()
|
all_urls = set()
|
||||||
|
|
||||||
sources = sb.execute_script(f"""
|
sources = sb.execute_script(js_css_selector)
|
||||||
return Array.from(document.querySelectorAll("{css_selector}"))
|
# js_for_css_selectors
|
||||||
.map(el => el.src || el.href)
|
|
||||||
.filter(Boolean);
|
|
||||||
""")
|
|
||||||
for src in sources:
|
for src in sources:
|
||||||
if len(all_urls) >= max_media:
|
if len(all_urls) >= max_media:
|
||||||
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
logger.debug(f"Reached max download limit of {max_media} images/videos.")
|
||||||
|
|
|
@ -53,6 +53,26 @@ class Dropin:
|
||||||
"""
|
"""
|
||||||
return "video, source"
|
return "video, source"
|
||||||
|
|
||||||
|
def js_for_image_css_selectors(self) -> str:
    """
    Build the JavaScript snippet used to collect image URLs from the open page.

    The snippet selects every element matching `self.images_selectors()` and
    returns an array of URL strings (each element's `src`, falling back to
    `href`), with empty values filtered out. It returns URLs, not DOM elements.

    Override this method instead of `images_selectors` for more control over
    which images are scraped.

    :return: JavaScript source whose `return` statement yields the URL array.
    """
    # Local import keeps this block self-contained.
    import json

    # json.dumps emits a double-quoted, fully escaped string literal that is
    # also valid JavaScript, so selectors containing quotes or backslashes
    # (e.g. img[alt="x"]) no longer break the generated script.
    selector_literal = json.dumps(self.images_selectors())
    return f"""
        return Array.from(document.querySelectorAll({selector_literal})).map(el => el.src || el.href).filter(Boolean);
    """
|
||||||
|
|
||||||
|
def js_for_video_css_selectors(self) -> str:
    """
    Build the JavaScript snippet used to collect video URLs from the open page.

    The snippet selects every element matching `self.video_selectors()` and
    returns an array of URL strings (each element's `src`, falling back to
    `href`), with empty values filtered out. It returns URLs, not DOM elements.

    Override this method instead of `video_selectors` for more control over
    which videos are scraped.

    :return: JavaScript source whose `return` statement yields the URL array.
    """
    # Local import keeps this block self-contained.
    import json

    # json.dumps emits a double-quoted, fully escaped string literal that is
    # also valid JavaScript, so selectors containing quotes or backslashes
    # (e.g. video[data-kind="main"]) no longer break the generated script.
    selector_literal = json.dumps(self.video_selectors())
    return f"""
        return Array.from(document.querySelectorAll({selector_literal})).map(el => el.src || el.href).filter(Boolean);
    """
|
||||||
|
|
||||||
def open_page(self, url) -> bool:
|
def open_page(self, url) -> bool:
|
||||||
"""
|
"""
|
||||||
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
|
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
|
||||||
|
@ -66,7 +86,7 @@ class Dropin:
|
||||||
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
|
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
|
||||||
:return: A tuple (number of Images added, number of Videos added).
|
:return: A tuple (number of Images added, number of Videos added).
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError("This method should be implemented in the subclass")
|
return 0, 0
|
||||||
|
|
||||||
def _get_username_password(self, site) -> tuple[str, str]:
|
def _get_username_password(self, site) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
from auto_archiver.core.metadata import Metadata
|
|
||||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
|
||||||
|
|
||||||
def open_page(self, url) -> bool:
|
def open_page(self, url) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
|
|
||||||
return 0, 0
|
|
||||||
|
|
Ładowanie…
Reference in New Issue