Mirror of https://github.com/bellingcat/auto-archiver

Further cleanup, abstracts 'dropins' out into generic files
Parent: dff0105659
Commit: 4bb4ebdf82
@@ -7,69 +7,75 @@ from loguru import logger

Bluesky dropin: the previous module-level create_metadata, _download_bsky_embeds and _get_post_data functions are removed and their (otherwise unchanged) logic becomes methods on a new Bluesky dropin class. New contents of the changed section:

from auto_archiver.core.context import ArchivingContext
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.core.metadata import Metadata, Media

from .dropin import GenericDropin, InfoExtractor


class Bluesky(GenericDropin):

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
        result = Metadata()
        result.set_url(url)
        result.set_title(post["record"]["text"])
        result.set_timestamp(post["record"]["createdAt"])
        for k, v in self._get_post_data(post).items():
            if v: result.set(k, v)

        # download if embeds present (1 video XOR >=1 images)
        for media in self._download_bsky_embeds(post, archiver):
            result.add_media(media)
        logger.debug(f"Downloaded {len(result.media)} media files")

        return result

    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
        handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
        return ie_instance._extract_post(handle=handle, post_id=video_id)

    def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
        """
        Iterates over image(s) or video in a Bluesky post and downloads them
        """
        media = []
        embed = post.get("record", {}).get("embed", {})
        image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
        video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]

        media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
        for image_media in image_medias:
            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
            image_media = archiver.download_from_url(url)
            media.append(image_media)
        for video_media in video_medias:
            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
            video_media = archiver.download_from_url(url)
            media.append(video_media)
        return media

    def _get_post_data(self, post: dict) -> dict:
        """
        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
        """
        author = post["author"]
        if "labels" in author and not author["labels"]:
            del author["labels"]
        if "associated" in author:
            del author["associated"]

        mentions, tags, links = [], [], []
        facets = post.get("record", {}).get("facets", [])
        for f in facets:
            for feature in f["features"]:
                if feature["$type"] == "app.bsky.richtext.facet#mention":
                    mentions.append(feature["did"])
                elif feature["$type"] == "app.bsky.richtext.facet#tag":
                    tags.append(feature["tag"])
                elif feature["$type"] == "app.bsky.richtext.facet#link":
                    links.append(feature["uri"])
        res = {"author": author}
        if mentions:
            res["mentions"] = mentions
        if tags:
            res["tags"] = tags
        if links:
            res["links"] = links
        return res
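As a rough illustration of what _get_post_data extracts (the post below is a minimal, invented stand-in for a real getPostThread response; only the field names used by the code above are assumed):

# Hypothetical input, shaped like the 'post' dict consumed above; all values are made up.
example_post = {
    "author": {"did": "did:plc:example", "handle": "example.bsky.social"},
    "record": {
        "text": "hello",
        "createdAt": "2024-01-01T00:00:00Z",
        "facets": [
            {"features": [{"$type": "app.bsky.richtext.facet#tag", "tag": "osint"}]},
            {"features": [{"$type": "app.bsky.richtext.facet#link", "uri": "https://example.com"}]},
        ],
    },
}
# Bluesky()._get_post_data(example_post) would then return:
# {"author": {"did": "did:plc:example", "handle": "example.bsky.social"},
#  "tags": ["osint"], "links": ["https://example.com"]}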
@@ -0,0 +1,58 @@

New file: the GenericDropin base class that the platform-specific dropins subclass.

from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver


class GenericDropin:
    """Base class for dropins for the generic extractor.

    In many instances, an extractor will exist in ytdlp, but it will only process videos.
    Dropins can be created and used to make use of the already-written private code of a
    specific extractor from ytdlp.

    The dropin should be able to handle the following methods:

    - `get_post_data`: This method should be able to extract the post data from the url and return it as a dict.
    - `create_metadata`: This method should be able to create a Metadata object from a post dict.

    Optional methods include:

    - `skip_ytdlp_download`: If you want to skip the ytdlp 'download' method all together, and do your own, then return True for this method.
      This is useful in cases where ytdlp might not work properly for all of your posts
    - `keys_to_clean`: for the generic 'video_data' created by ytdlp (for video URLs), any additional fields you would like to clean out of the data before storing in metadata
    """

    def extract_post(self, url: str, ie_instance: InfoExtractor):
        """
        This method should return the post data from the url.
        """
        raise NotImplementedError("This method should be implemented in the subclass")

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
        """
        This method should create a Metadata object from the post data.
        """
        raise NotImplementedError("This method should be implemented in the subclass")

    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
        """
        This method should return True if you want to skip the ytdlp download method.
        """
        return False

    def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor):
        """
        This method should return a list of strings (keys) to clean from the video_data dict.

        E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
        """
        return []

    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata):
        """
        This method should download any additional media from the post.
        """
        return metadata
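To make the contract concrete, here is a minimal sketch of a hypothetical dropin for an imagined "ExampleSite" yt-dlp extractor (the site, its extractor and its JSON endpoint are invented; only the GenericDropin interface and the standard InfoExtractor helpers _match_id/_download_json are real):

from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver

from .dropin import GenericDropin, InfoExtractor


class Examplesite(GenericDropin):
    """Hypothetical dropin; named after the lowercased/title-cased ie_key 'ExampleSite'."""

    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
        # _match_id is a standard InfoExtractor helper; the API endpoint below is made up.
        post_id = ie_instance._match_id(url)
        return ie_instance._download_json(f"https://example.com/api/posts/{post_id}", post_id)

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
        # Map whatever the (invented) API returned onto the auto-archiver Metadata object.
        result = Metadata()
        result.set_url(url)
        result.set_title(post.get("text", ""))
        return result

    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor) -> bool:
        # e.g. bypass yt-dlp's own download step entirely for this hypothetical site
        return True

Given how dropin_for_extractor (added later in this commit) resolves dropins, such a class would live in a module named after the extractor's lowercased ie_key (examplesite.py), with the class named after its title-cased form.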
@@ -1,16 +1,16 @@

Generic archiver module, imports and class attributes:

 import datetime, os, yt_dlp, pysubs2
+import importlib
 from typing import Type
 from yt_dlp.extractor.common import InfoExtractor

 from loguru import logger

-from . import bluesky, twitter, truth
 from auto_archiver.archivers.archiver import Archiver
 from ...core import Metadata, Media, ArchivingContext


 class GenericArchiver(Archiver):
     name = "youtubedl_archiver"  #left as is for backwards compat
+    _dropins = {}

     def __init__(self, config: dict) -> None:
         super().__init__(config)
@@ -22,23 +22,22 @@ class GenericArchiver(Archiver):

The static configs() method and its option dictionary are removed from this class, the suitable_extractors/suitable helpers move up here, and download_additional_media now takes the InfoExtractor instead of an extractor_key string:

         self.allow_playlist = bool(self.allow_playlist)
         self.max_downloads = self.max_downloads

-    @staticmethod
-    def configs() -> dict:
-        return {
-            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
-            "subtitles": {"default": True, "help": "download subtitles if available"},
-            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
-            "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
-            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
-            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
-            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
-            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
-            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
-            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
-            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
-        }
-
-    def download_additional_media(self, extractor_key: str, video_data: dict, metadata: Metadata) -> Metadata:
+    def suitable_extractors(self, url: str) -> list[str]:
+        """
+        Returns a list of valid extractors for the given URL"""
+        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
+            if info_extractor.suitable(url) and info_extractor.working():
+                yield info_extractor
+
+    def suitable(self, url: str) -> bool:
+        """
+        Checks for valid URLs out of all ytdlp extractors.
+        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
+        """
+        return any(self.suitable_extractors(url))
+
+    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
         """
         Downloads additional media like images, comments, subtitles, etc.
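A quick sketch of how the relocated helpers are used (the URL is just an example; config is whatever dict the orchestrator normally passes to archivers):

archiver = GenericArchiver(config)
url = "https://twitter.com/bellingcat/status/1234567890"
if archiver.suitable(url):
    # suitable_extractors is a generator over yt-dlp's registered, working extractors
    matching = [ie.IE_NAME for ie in archiver.suitable_extractors(url)]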
@@ -56,11 +55,18 @@ class GenericArchiver(Archiver):

         except Exception as e:
             logger.error(f"Error downloading cover image {thumbnail_url}: {e}")

+        dropin = self.dropin_for_extractor(info_extractor)
+        if dropin:
+            try:
+                metadata = dropin.download_additional_media(video_data, info_extractor, metadata)
+            except AttributeError:
+                pass
+
         return metadata

-    def keys_to_clean(self, extractor_key: str, video_data: dict) -> dict:
+    def keys_to_clean(self, info_extractor: InfoExtractor, video_data: dict) -> dict:
         """
-        Clean up the video data to make it more readable and remove unnecessary keys that ytdlp adds
+        Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
         """

         base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
@@ -71,23 +77,23 @@ class GenericArchiver(Archiver):

         'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
         '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
         'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
-        if extractor_key == 'TikTok':
-            # Tiktok: only has videos so a valid ytdlp `video_data` object is returned. Base keys are enough
-            return base_keys + []
-        elif extractor_key == "Bluesky":
-            # bluesky API response for non video URLs is already clean, nothing to add
-            return base_keys + []
+
+        dropin = self.dropin_for_extractor(info_extractor)
+        if dropin:
+            try:
+                base_keys += dropin.keys_to_clean(video_data, info_extractor)
+            except AttributeError:
+                pass
+
         return base_keys

-    def add_metadata(self, extractor_key: str, video_data: dict, url:str, result: Metadata) -> Metadata:
+    def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
         """
-        Creates a Metadata object from the give video_data
+        Creates a Metadata object from the given video_data
         """

         # first add the media
-        result = self.download_additional_media(extractor_key, video_data, result)
+        result = self.download_additional_media(video_data, info_extractor, result)

         # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
         result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
@@ -110,7 +116,7 @@ class GenericArchiver(Archiver):

         result.set("upload_date", upload_date)

         # then clean away any keys we don't want
-        for clean_key in self.keys_to_clean(extractor_key, video_data):
+        for clean_key in self.keys_to_clean(info_extractor, video_data):
             video_data.pop(clean_key, None)

         # then add the rest of the video data
@@ -120,35 +126,6 @@ class GenericArchiver(Archiver):

suitable_extractors and suitable are removed from this position (they move up in the class, see the @@ -22,23 hunk above), and the hard-coded per-platform create_metadata_for_post is removed entirely:

         return result

-    def suitable_extractors(self, url: str) -> list[str]:
-        """
-        Returns a list of valid extractors for the given URL"""
-        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
-            if info_extractor.suitable(url) and info_extractor.working():
-                yield info_extractor
-
-    def suitable(self, url: str) -> bool:
-        """
-        Checks for valid URLs out of all ytdlp extractors.
-        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
-        """
-        return any(self.suitable_extractors(url))
-
-    def create_metadata_for_post(self, info_extractor: InfoExtractor, post_data: dict, url: str) -> Metadata:
-        """
-        Standardizes the output of the 'post' data from a ytdlp InfoExtractor to Metadata object.
-
-        This is only required for platforms that don't have videos, and therefore cannot be converted into ytdlp valid 'video_data'.
-        In these instances, we need to use the extractor's _extract_post (or similar) method to get the post metadata, and then convert
-        it into a Metadata object via a platform-specific function.
-        """
-        if info_extractor.ie_key() == 'Bluesky':
-            return bluesky.create_metadata(post_data, self, url)
-        if info_extractor.ie_key() == 'Twitter':
-            return twitter.create_metadata(post_data, self, url)
-        if info_extractor.ie_key() == 'Truth':
-            return truth.create_metadata(post_data, self, url)

     def get_metatdata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
         Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
@@ -156,45 +133,29 @@ class GenericArchiver(Archiver):

The per-platform if/elif chain in get_metatdata_for_post is replaced by the dropin lookup, and get_metadata_for_video renames its first parameter from info to data:

         ie_instance = info_extractor(downloader=ydl)
         post_data = None
+        dropin = self.dropin_for_extractor(info_extractor)
+        if not dropin:
+            # TODO: add a proper link to 'how to create your own dropin'
+            logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
+            Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
+            return False

-        if info_extractor.ie_key() == 'Bluesky':
-            # bluesky kwargs are handle, video_id
-            handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
-            post_data = ie_instance._extract_post(handle=handle, post_id=video_id)
-        elif info_extractor.ie_key() == 'Twitter':
-            # twitter kwargs are tweet_id
-            twid = ie_instance._match_valid_url(url).group('id')
-            # TODO: if ytdlp PR https://github.com/yt-dlp/yt-dlp/pull/12098 is merged, change to _extract_post
-            post_data = ie_instance._extract_status(twid=twid)
-        elif info_extractor.ie_key() == 'Truth':
-            video_id = ie_instance._match_id(url)
-            truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
-            post_data = ie_instance._download_json(truthsocial_url, video_id)
-        else:
-            # lame attempt at trying to get data for an unknown extractor
-            # TODO: test some more video platforms and see if there's any improvement to be made
-            try:
-                post_data = ie_instance._extract_post(url)
-            except (NotImplementedError, AttributeError) as e:
-                logger.debug(f"Extractor {info_extractor.ie_key()} does not support extracting post info from non-video URLs: {e}")
-                return False
-
-        return self.create_metadata_for_post(ie_instance, post_data, url)
+        post_data = dropin.extract_post(url, ie_instance)
+        return dropin.create_metadata(post_data, ie_instance, self, url)

-    def get_metadata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+    def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:

         # this time download
         ydl.params['getcomments'] = self.comments
         #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
-        if "entries" in info:
-            entries = info.get("entries", [])
+        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        if "entries" in data:
+            entries = data.get("entries", [])
             if not len(entries):
                 logger.warning('YoutubeDLArchiver could not find any video')
                 return False
-        else: entries = [info]
+        else: entries = [data]

-        extractor_key = info['extractor_key']
         result = Metadata()

         for entry in entries:
@@ -209,7 +170,7 @@ class GenericArchiver(Archiver):

         # read text from subtitles if enabled
         if self.subtitles:
-            for lang, val in (info.get('requested_subtitles') or {}).items():
+            for lang, val in (data.get('requested_subtitles') or {}).items():
                 try:
                     subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
                     text = " ".join([line.text for line in subs])
@@ -220,9 +181,49 @@ class GenericArchiver(Archiver):

add_metadata is called with the renamed arguments, and a new dropin_for_extractor method resolves and caches the dropin for a given yt-dlp extractor:

             except Exception as e:
                 logger.error(f"Error processing entry {entry}: {e}")

-        return self.add_metadata(extractor_key, info, url, result)
+        return self.add_metadata(data, info_extractor, url, result)

-    def download_for_extractor(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+    def dropin_for_extractor(self, info_extractor: Type[InfoExtractor], additional_paths = []):
+        dropin_name = info_extractor.ie_key().lower()
+
+        if dropin_name == "generic":
+            # no need for a dropin for the generic extractor (?)
+            return None
+
+        dropin_class_name = dropin_name.title()
+        def _load_dropin(dropin):
+            dropin_class = getattr(dropin, dropin_class_name)()
+            return self._dropins.setdefault(dropin_name, dropin_class)
+
+        try:
+            return self._dropins[dropin_name]
+        except KeyError:
+            pass
+
+        # TODO: user should be able to pass --dropins="/some/folder,/other/folder" as a cmd line option
+        # which would allow the user to override the default dropins/add their own
+        paths = [] + additional_paths
+        for path in paths:
+            dropin_path = os.path.join(path, f"{dropin_name}.py")
+            dropin_spec = importlib.util.spec_from_file_location(dropin_name, dropin_path)
+            if not dropin_spec:
+                continue
+            try:
+                dropin = importlib.util.module_from_spec(dropin_spec)
+                dropin_spec.loader.exec_module(dropin)
+                return _load_dropin(dropin)
+            except (FileNotFoundError, ModuleNotFoundError):
+                pass
+
+        # fallback to loading the dropins within auto-archiver
+        try:
+            return _load_dropin(importlib.import_module(f".{dropin_name}", package=__package__))
+        except ModuleNotFoundError:
+            pass
+
+        return None
+
+    def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
         Tries to download the given url using the specified extractor
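A short illustration of how that resolution plays out for one of the dropins in this commit (the comments only trace the lookup order of the code above; no extra API is assumed):

# Assuming yt-dlp picked its Twitter extractor, so info_extractor.ie_key() == "Twitter":
#   dropin_name == "twitter", dropin_class_name == "Twitter"
#   1. cache:      self._dropins["twitter"]              -> KeyError on first use
#   2. extra dirs: <additional_paths>/twitter.py         -> empty by default
#   3. fallback:   importlib.import_module(".twitter", package=__package__)
#                  then getattr(module, "Twitter")() is instantiated and cached in _dropins.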
@@ -233,19 +234,19 @@ class GenericArchiver(Archiver):

         ydl.params['getcomments'] = False
         result = False

+        dropin_submodule = self.dropin_for_extractor(info_extractor)
+
         try:
-            if info_extractor.ie_key() == "Truth":
-                # the ytdlp truth extractor currently only gets the first image/video in the 'media' section, as opposed to all of them
-                # we don't want this
-                raise yt_dlp.utils.ExtractorError("Use the 'post data' method for Truth posts")
+            if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
+                raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")

             # don't download since it can be a live stream
-            info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-            if info.get('is_live', False) and not self.livestreams:
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
+            if data.get('is_live', False) and not self.livestreams:
                 logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
                 return False
             # it's a valid video, that the youtubdedl can download out of the box
-            result = self.get_metadata_for_video(info, info_extractor, url, ydl)
+            result = self.get_metadata_for_video(data, info_extractor, url, ydl)

         except Exception as e:
             logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
@@ -1,39 +1,52 @@

Truth Social dropin: the module-level create_metadata function (which still contained a stray breakpoint() call) is replaced by a Truth dropin class; import datetime is dropped in favour of from typing import Type, and the post lookup and 'skip ytdlp' behaviour previously hard-coded in the archiver move into extract_post/skip_ytdlp_download. New contents of the changed section:

from typing import Type

from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.archivers.archiver import Archiver
from yt_dlp.extractor.common import InfoExtractor

from dateutil.parser import parse as parse_dt

from .dropin import GenericDropin


class Truth(GenericDropin):

    def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
        video_id = ie_instance._match_id(url)
        truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
        return ie_instance._download_json(truthsocial_url, video_id)

    def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
        return True

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
        """
        Creates metadata from a truth social post

        Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media

        Format is:

        {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
        """

        result = Metadata()
        result.set_url(url)
        timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
        result.set_timestamp(parse_dt(timestamp))
        result.set('description', post['content'])
        result.set('author', post['account']['username'])

        for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
            if isinstance(key, tuple):
                store_key = " ".join(key)
            else:
                store_key = key
            result.set(store_key, traverse_obj(post, key))

        # add the media
        for media in post.get('media_attachments', []):
            filename = archiver.download_from_url(media['url'])
            result.add_media(Media(filename), id=media.get('id'))

        return result
@@ -8,55 +8,63 @@ from auto_archiver.core.metadata import Metadata, Media

Twitter dropin: the module-level choose_variant and create_metadata functions become methods on a new Twitter dropin class, and extract_post wraps the _extract_status call previously made in the archiver. New contents of the changed section:

from auto_archiver.utils import UrlUtil
from auto_archiver.archivers.archiver import Archiver

from .dropin import GenericDropin, InfoExtractor


class Twitter(GenericDropin):

    def choose_variant(self, variants):
        # choosing the highest quality possible
        variant, width, height = None, 0, 0
        for var in variants:
            if var.get("content_type", "") == "video/mp4":
                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                if width_height:
                    w, h = int(width_height[1]), int(width_height[2])
                    if w > width or h > height:
                        width, height = w, h
                        variant = var
            else:
                variant = var if not variant else variant
        return variant

    def extract_post(self, url: str, ie_instance: InfoExtractor):
        twid = ie_instance._match_valid_url(url).group('id')
        return ie_instance._extract_status(twid=twid)

    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
        result = Metadata()
        try:
            if not tweet.get("user") or not tweet.get("created_at"):
                raise ValueError(f"Error retreiving post. Are you sure it exists?")
            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        except (ValueError, KeyError) as ex:
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False

        result\
            .set_title(tweet.get('full_text', ''))\
            .set_content(json.dumps(tweet, ensure_ascii=False))\
            .set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
            result.status = "twitter-ytdl"
            return result

        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
            mimetype = ""
            if tw_media["type"] == "photo":
                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
                mimetype = "image/jpeg"
            elif tw_media["type"] == "video":
                variant = self.choose_variant(tw_media['video_info']['variants'])
                media.set("src", variant['url'])
                mimetype = variant['content_type']
            elif tw_media["type"] == "animated_gif":
                variant = tw_media['video_info']['variants'][0]
                media.set("src", variant['url'])
                mimetype = variant['content_type']
            ext = mimetypes.guess_extension(mimetype)
            media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
            result.add_media(media)
        return result
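A rough sketch of what choose_variant does with a yt-dlp style variant list (the URLs and sizes below are invented purely for illustration):

# Hypothetical input, shaped like tweet["entities"]["media"][i]["video_info"]["variants"]:
variants = [
    {"content_type": "application/x-mpegURL", "url": "https://video.twimg.com/ext_tw_video/1/pl/playlist.m3u8"},
    {"content_type": "video/mp4", "url": "https://video.twimg.com/ext_tw_video/1/vid/480x270/a.mp4"},
    {"content_type": "video/mp4", "url": "https://video.twimg.com/ext_tw_video/1/vid/1280x720/b.mp4"},
]
# Twitter().choose_variant(variants) keeps the mp4 whose /WxH/ path segment is largest,
# i.e. the 1280x720 entry in this made-up example.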
@@ -0,0 +1,41 @@

New file: a CSVFeeder that reads URLs to archive from one or more CSV files.

from loguru import logger
import csv

from . import Feeder
from ..core import Metadata, ArchivingContext
from ..utils import url_or_none


class CSVFeeder(Feeder):

    @staticmethod
    def configs() -> dict:
        return {
            "files": {
                "default": None,
                "help": "Path to the input file(s) to read the URLs from, comma separated. \
                    Input files should be formatted with one URL per line",
                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            }
        }

    def __iter__(self) -> Metadata:
        url_column = self.column or 0
        for file in self.files:
            with open(file, "r") as f:
                reader = csv.reader(f)
                first_row = next(reader)
                if not(url_or_none(first_row[url_column])):
                    # it's a header row, skip it
                    logger.debug(f"Skipping header row: {first_row}")
                for row in reader:
                    url = row[0]
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
                    ArchivingContext.set("folder", "cli")

        logger.success(f"Processed {len(self.urls)} URL(s)")
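For illustration, an input file this feeder could consume might look like the following (contents invented). With "column" unset, the header row's first cell ("url") is not a valid URL, so url_or_none() flags it and the row is skipped, and each remaining row's first cell is yielded as a Metadata URL:

url,notes
https://example.com/some/post,first item
https://example.com/another/post,second item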
@@ -7,4 +7,4 @@ from .url import UrlUtil

 from .atlos import get_atlos_config_options

 # handy utils from ytdlp
-from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none)
+from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)