Small fixups + implement Truth code for posts with multiple media

pull/175/head
Patrick Robertson 2025-01-20 18:40:46 +01:00
rodzic fd2e7f973b
commit dff0105659
3 zmienionych plików z 22 dodań i 9 usunięć

Wyświetl plik

@ -181,7 +181,8 @@ class GenericArchiver(Archiver):
return self.create_metadata_for_post(ie_instance, post_data, url) return self.create_metadata_for_post(ie_instance, post_data, url)
def get_metatdata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata: def get_metadata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
# this time download # this time download
ydl.params['getcomments'] = self.comments ydl.params['getcomments'] = self.comments
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded? #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
@ -233,13 +234,18 @@ class GenericArchiver(Archiver):
result = False result = False
try: try:
if info_extractor.ie_key() == "Truth":
# the ytdlp truth extractor currently only gets the first image/video in the 'media' section, as opposed to all of them
# we don't want this
raise yt_dlp.utils.ExtractorError("Use the 'post data' method for Truth posts")
# don't download since it can be a live stream # don't download since it can be a live stream
info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False) info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
if info.get('is_live', False) and not self.livestreams: if info.get('is_live', False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting") logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False return False
# it's a valid video, that the youtubdedl can download out of the box # it's a valid video, that the youtubdedl can download out of the box
result = self.get_metatdata_for_video(info, info_extractor, url, ydl) result = self.get_metadata_for_video(info, info_extractor, url, ydl)
except Exception as e: except Exception as e:
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead') logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')

Wyświetl plik

@ -1,31 +1,39 @@
import datetime import datetime
from auto_archiver.utils import clean_html, traverse_obj from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.archivers.archiver import Archiver from auto_archiver.archivers.archiver import Archiver
from dateutil.parser import parse as parse_dt
def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
    """
    Creates metadata from a Truth Social post, including its media attachments.

    Every entry in the post's 'media_attachments' list is downloaded through
    `archiver.download_from_url` and attached to the result, so posts with
    several images/videos are archived in full.

    Args:
        post: the post dict returned by the extractor (example below).
        archiver: the archiver used to download each attachment URL.
        url: the URL of the post, stored on the returned metadata.

    Returns:
        A Metadata object with the url, timestamp, description, author,
        engagement/account statistics and one Media per attachment.

    Raises:
        KeyError: if 'created_at', 'content', 'account'/'username' or an
            attachment's 'url' is missing from the post data.

    Format is:

    {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
    """
    result = Metadata()
    result.set_url(url)

    timestamp = post['created_at']  # format is 2022-12-29T19:51:18.161Z
    result.set_timestamp(parse_dt(timestamp))
    result.set('description', post['content'])
    result.set('author', post['account']['username'])

    # Plain strings are top-level keys of the post; tuples are nested paths
    # and are stored under their space-joined name (e.g. 'account followers_count').
    # NOTE(review): the sample payload above only has 'in_reply_to_account_id',
    # not 'in_reply_to_account' - confirm which key the API actually returns.
    extra_keys = (
        'replies_count',
        'reblogs_count',
        'favourites_count',
        ('account', 'followers_count'),
        ('account', 'following_count'),
        ('account', 'statuses_count'),
        ('account', 'display_name'),
        'language',
        'in_reply_to_account',
    )
    for key in extra_keys:
        store_key = " ".join(key) if isinstance(key, tuple) else key
        result.set(store_key, traverse_obj(post, key))

    # add the media; `or []` also covers an explicit None value for the key
    for attachment in post.get('media_attachments') or []:
        filename = archiver.download_from_url(attachment['url'])
        result.add_media(Media(filename), id=attachment.get('id'))

    return result

Wyświetl plik

@ -125,7 +125,6 @@ class TestGenericArchiver(TestArchiverBase):
assert len(result.media) == 1 assert len(result.media) == 1
assert result is not False assert result is not False
@pytest.mark.skip("Currently failing, multiple images are not being downloaded - this is due to an issue with ytdlp extractor")
@pytest.mark.download @pytest.mark.download
def test_truthsocial_download_multiple_images(self, make_item): def test_truthsocial_download_multiple_images(self, make_item):
item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135") item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")