kopia lustrzana https://github.com/bellingcat/auto-archiver
Create facebook dropin - working for images + text. CAVEAT: only gets the first ~100 chars of the post at the moment
rodzic
5211c5de18
commit
f8e846d59a
|
@ -1,3 +1,4 @@
|
|||
from typing import Type
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
@ -23,6 +24,8 @@ class GenericDropin:
|
|||
|
||||
"""
|
||||
|
||||
extractor: Type[Extractor] = None
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||
"""
|
||||
This method should return the post data from the url.
|
||||
|
@ -55,4 +58,11 @@ class GenericDropin:
|
|||
"""
|
||||
This method should download any additional media from the post.
|
||||
"""
|
||||
return metadata
|
||||
return metadata
|
||||
|
||||
def is_suitable(self, url, info_extractor: InfoExtractor):
|
||||
"""
|
||||
Used to override the InfoExtractor's 'is_suitable' method. Dropins should override this method to return True if the url is suitable for the extractor
|
||||
(based on being able to parse other URLs)
|
||||
"""
|
||||
return False
|
|
@ -1,18 +1,36 @@
|
|||
import re
|
||||
from .dropin import GenericDropin
|
||||
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.media import Media
|
||||
|
||||
class Facebook(GenericDropin):
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
video_id = ie_instance._match_valid_url(url).group('id')
|
||||
ie_instance._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
|
||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||
post_data = ie_instance._extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
def extract_post(self, url: str, ie_instance):
    """
    Download the Facebook post page and return the metadata extracted from it.

    The post id is read from the url itself: a 'pfbid...' token, a run of digits,
    or a photo id of the form 't.<digits>/<digits>' (the first match in the url wins).

    Raises ValueError if the url contains nothing that looks like a post id.
    """
    post_id_regex = r'(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))'
    id_match = re.search(post_id_regex, url)
    if id_match is None:
        # Without this guard the failure would be an opaque AttributeError on None.
        raise ValueError(f"Could not find a Facebook post id in url: {url}")
    post_id = id_match.group('id')

    # mobile links are rewritten so the www.facebook.com page is fetched
    webpage = ie_instance._download_webpage(
        url.replace('://m.facebook.com/', '://www.facebook.com/'), post_id)

    # WARN: Will only work once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
    # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...
    post_data = ie_instance._extract_metadata(webpage, post_id)
    return post_data
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
metadata = archiver.create_metadata(url)
|
||||
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
|
||||
return metadata
|
||||
result = Metadata()
|
||||
result.set_content(post.get('description', ''))
|
||||
result.set_title(post.get('title', ''))
|
||||
result.set('author', post.get('uploader', ''))
|
||||
result.set_url(url)
|
||||
return result
|
||||
|
||||
def is_suitable(self, url, info_extractor):
    """
    Return True if the url starts with an http(s) address on facebook.com
    (optionally behind one subdomain label such as 'www.' or 'm.') or on the
    facebook...onion host listed in the pattern below.
    """
    # A single '|' separates the two hosts. A doubled '||' would add an empty
    # alternative and let urls like 'https://example./' through.
    regex = r'(?:https?://(?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)'
    return bool(re.match(regex, url))
|
||||
|
||||
def skip_ytdlp_download(self, url: str, ie_instance):
    """
    Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX

    Returns True when the url contains such a photo id, False otherwise.
    """
    # The dot is escaped so only a literal 't.' prefix counts as a photo id.
    return bool(re.search(r'/t\.\d+/\d+', url))
|
|
@ -8,6 +8,8 @@ from loguru import logger
|
|||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class Skip(Exception):
    """Raised when a dropin asks to skip the ytdlp download step, so the post is handled by the dropin instead."""
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
|
@ -15,8 +17,20 @@ class GenericExtractor(Extractor):
|
|||
"""
|
||||
Returns a list of valid extractors for the given URL"""
|
||||
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
|
||||
if info_extractor.suitable(url) and info_extractor.working():
|
||||
if not info_extractor.working():
|
||||
continue
|
||||
|
||||
# check if there's a dropin and see if that declares whether it's suitable
|
||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||
if dropin and dropin.is_suitable(url, info_extractor):
|
||||
yield info_extractor
|
||||
continue
|
||||
|
||||
if info_extractor.suitable(url):
|
||||
yield info_extractor
|
||||
continue
|
||||
|
||||
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
|
@ -129,7 +143,8 @@ class GenericExtractor(Extractor):
|
|||
return False
|
||||
|
||||
post_data = dropin.extract_post(url, ie_instance)
|
||||
return dropin.create_metadata(post_data, ie_instance, self, url)
|
||||
result = dropin.create_metadata(post_data, ie_instance, self, url)
|
||||
return self.add_metadata(post_data, info_extractor, url, result)
|
||||
|
||||
def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
|
||||
|
@ -181,6 +196,7 @@ class GenericExtractor(Extractor):
|
|||
dropin_class_name = dropin_name.title()
|
||||
def _load_dropin(dropin):
|
||||
dropin_class = getattr(dropin, dropin_class_name)()
|
||||
dropin.extractor = self
|
||||
return self._dropins.setdefault(dropin_name, dropin_class)
|
||||
|
||||
try:
|
||||
|
@ -225,8 +241,9 @@ class GenericExtractor(Extractor):
|
|||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||
|
||||
try:
|
||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
|
||||
raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()} (dropin override)")
|
||||
raise Skip()
|
||||
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
|
@ -240,15 +257,17 @@ class GenericExtractor(Extractor):
|
|||
if info_extractor.ie_key() == "generic":
|
||||
# don't clutter the logs with issues about the 'generic' extractor not having a dropin
|
||||
return False
|
||||
|
||||
if not isinstance(e, Skip):
|
||||
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead')
|
||||
|
||||
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
logger.error(f'Error downloading metadata for post: {post_e}')
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True)
|
||||
logger.debug(f'Attempt to extract using ytdlp dropin for "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True)
|
||||
return False
|
||||
|
||||
if result:
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {"default": True, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer's file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
|
|
@ -39,6 +39,17 @@ class TestGenericExtractor(TestExtractorBase):
|
|||
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("url, suitable_extractors", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),])
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
suitable_extractors = suitable_extractors + ['generic'] # the generic is valid for all
|
||||
extractors = list(self.extractor.suitable_extractors(url))
|
||||
assert len(extractors) == len(suitable_extractors)
|
||||
assert [e.ie_key().lower() for e in extractors] == suitable_extractors
|
||||
|
||||
@pytest.mark.parametrize("url, is_suitable", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
|
@ -48,7 +59,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
|
||||
("https://google.com", True)])
|
||||
def test_suitable_urls(self, make_item, url, is_suitable):
|
||||
def test_suitable_urls(self, url, is_suitable):
|
||||
"""
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
|
@ -209,4 +220,33 @@ class TestGenericExtractor(TestExtractorBase):
|
|||
timestamp
|
||||
)
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].hash == image_hash
|
||||
assert post.media[0].hash == image_hash
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_video(self, make_item):
|
||||
|
||||
post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
|
||||
assert len(post.media) == 2
|
||||
assert post.media[0].filename.endswith("588371253839133.mp4")
|
||||
assert post.media[0].mimetype == "video/mp4"
|
||||
|
||||
assert post.media[1].filename.endswith(".jpg")
|
||||
assert post.media[1].mimetype == "image/jpeg"
|
||||
|
||||
assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_image(self, make_item):
|
||||
|
||||
post = self.extractor.download(make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"))
|
||||
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].filename.endswith(".png")
|
||||
assert "Byline Festival - BylineFest Partner" == post.get_title()
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_text_only(self, make_item):
|
||||
url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
|
||||
post = self.extractor.download(make_item(url))
|
||||
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get('content')
|
||||
assert post.get_title() == "Bellingcat"
|
||||
|
|
Ładowanie…
Reference in New Issue