Small fixups + implement Truth code for posts with multiple media

pull/175/head
Patrick Robertson 2025-01-20 18:40:46 +01:00
rodzic fd2e7f973b
commit dff0105659
3 zmienionych plików z 22 dodań i 9 usunięć

Wyświetl plik

@ -181,7 +181,8 @@ class GenericArchiver(Archiver):
return self.create_metadata_for_post(ie_instance, post_data, url)
def get_metatdata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
def get_metadata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
# this time download
ydl.params['getcomments'] = self.comments
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
@ -233,13 +234,18 @@ class GenericArchiver(Archiver):
result = False
try:
if info_extractor.ie_key() == "Truth":
# the ytdlp truth extractor currently only gets the first image/video in the 'media' section, as opposed to all of them
# we don't want this
raise yt_dlp.utils.ExtractorError("Use the 'post data' method for Truth posts")
# don't download since it can be a live stream
info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
if info.get('is_live', False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video that yt-dlp can download out of the box
result = self.get_metatdata_for_video(info, info_extractor, url, ydl)
result = self.get_metadata_for_video(info, info_extractor, url, ydl)
except Exception as e:
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')

Wyświetl plik

@ -1,31 +1,39 @@
import datetime
from auto_archiver.utils import clean_html, traverse_obj
from auto_archiver.core.metadata import Metadata
from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.archivers.archiver import Archiver
from dateutil.parser import parse as parse_dt
def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
    """
    Creates metadata from a Truth Social post.

    Copies the timestamp, content, author and a set of counters from the post
    dict into a new Metadata object, then downloads every entry listed under
    'media_attachments' through the archiver and attaches it, so posts with
    several images/videos keep all of them.

    Args:
        post: the post data dict (see the format example below).
        archiver: its ``download_from_url`` is called with each attachment's 'url'.
        url: the URL of the post; stored on the returned Metadata.

    Returns:
        The populated Metadata object.

    Raises:
        KeyError: if 'created_at', 'content', 'account'/'username' or an
            attachment's 'url' is missing from the post data.

    Format is:
    {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
    """
    result = Metadata()
    result.set_url(url)

    # format is 2022-12-29T19:51:18.161Z; parse_dt handles this ISO form without
    # requiring the fractional-seconds part that a fixed strptime pattern would
    timestamp = post['created_at']
    result.set_timestamp(parse_dt(timestamp))

    result.set('description', post['content'])
    result.set('author', post['account']['username'])

    # plain strings are top-level keys; tuples are paths into nested dicts and
    # are stored under their space-joined name (e.g. 'account followers_count')
    # NOTE(review): the sample payload has 'in_reply_to_account_id' and
    # 'in_reply_to' but no 'in_reply_to_account' - confirm which key is intended
    keys = [
        'replies_count',
        'reblogs_count',
        'favourites_count',
        ('account', 'followers_count'),
        ('account', 'following_count'),
        ('account', 'statuses_count'),
        ('account', 'display_name'),
        'language',
        'in_reply_to_account',
    ]
    for key in keys:
        store_key = " ".join(key) if isinstance(key, tuple) else key
        result.set(store_key, traverse_obj(post, key))

    # add the media; `or []` also covers an explicit None value
    for media in post.get('media_attachments') or []:
        filename = archiver.download_from_url(media['url'])
        result.add_media(Media(filename), id=media.get('id'))

    return result

Wyświetl plik

@ -125,7 +125,6 @@ class TestGenericArchiver(TestArchiverBase):
assert len(result.media) == 1
assert result is not False
@pytest.mark.skip("Currently failing, multiple images are not being downloaded - this is due to an issue with ytdlp extractor")
@pytest.mark.download
def test_truthsocial_download_multiple_images(self, make_item):
item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")