kopia lustrzana https://github.com/bellingcat/auto-archiver
Closes #305 and further fixes locating locally downloaded files produced by uncommon yt-dlp extractors
rodzic
d99b7c9efe
commit
b9f7927a3b
|
@ -1,3 +1,4 @@
|
||||||
|
import mimetypes
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -11,6 +12,7 @@ from urllib.request import urlretrieve
|
||||||
|
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
from yt_dlp.extractor.common import InfoExtractor
|
from yt_dlp.extractor.common import InfoExtractor
|
||||||
|
from yt_dlp.utils import MaxDownloadsReached
|
||||||
import pysubs2
|
import pysubs2
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
@ -362,7 +364,12 @@ class GenericExtractor(Extractor):
|
||||||
# this time download
|
# this time download
|
||||||
ydl.params["getcomments"] = self.comments
|
ydl.params["getcomments"] = self.comments
|
||||||
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
try:
|
||||||
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||||
|
except MaxDownloadsReached: # proceed as normal once MaxDownloadsReached is raised
|
||||||
|
pass
|
||||||
|
logger.success(data)
|
||||||
|
|
||||||
if "entries" in data:
|
if "entries" in data:
|
||||||
entries = data.get("entries", [])
|
entries = data.get("entries", [])
|
||||||
if not len(entries):
|
if not len(entries):
|
||||||
|
@ -372,14 +379,25 @@ class GenericExtractor(Extractor):
|
||||||
entries = [data]
|
entries = [data]
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
|
|
||||||
|
def _helper_get_filename(entry: dict) -> str:
|
||||||
|
entry_url = entry.get("url")
|
||||||
|
|
||||||
|
filename = ydl.prepare_filename(entry)
|
||||||
|
base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext'
|
||||||
|
directory = os.path.dirname(base_filename) # '/get/path/to'
|
||||||
|
basename = os.path.basename(base_filename) # 'file'
|
||||||
|
for f in os.listdir(directory):
|
||||||
|
if f.startswith(basename) or (entry_url and os.path.splitext(f)[0] in entry_url) and "video/" in (mimetypes.guess_type(f)[0] or ""):
|
||||||
|
return os.path.join(directory, f)
|
||||||
|
return False
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
try:
|
try:
|
||||||
filename = ydl.prepare_filename(entry)
|
filename = _helper_get_filename(entry)
|
||||||
if not os.path.exists(filename):
|
logger.warning(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||||
filename = filename.split(".")[0] + ".mkv"
|
|
||||||
|
|
||||||
if not os.path.exists(filename):
|
if not filename or not os.path.exists(filename):
|
||||||
logger.warning(f"File {filename} does not exist (see yt-dlp logs), skipping this entry.")
|
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_media = Media(filename)
|
new_media = Media(filename)
|
||||||
|
@ -460,6 +478,13 @@ class GenericExtractor(Extractor):
|
||||||
|
|
||||||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||||
|
|
||||||
|
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
||||||
|
if data.get("is_live", False) and not self.livestreams:
|
||||||
|
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||||
|
return False
|
||||||
|
# it's a valid video that youtube-dl (yt-dlp) can download out of the box
|
||||||
|
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||||
|
@ -467,11 +492,12 @@ class GenericExtractor(Extractor):
|
||||||
|
|
||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||||
if data.get("is_live", False) and not self.livestreams:
|
|
||||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||||
return False
|
|
||||||
# it's a valid video that youtube-dl (yt-dlp) can download out of the box
|
except MaxDownloadsReached:
|
||||||
result = self.get_metadata_for_video(data, info_extractor, url, ydl)
|
# yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
|
||||||
|
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if info_extractor.IE_NAME == "generic":
|
if info_extractor.IE_NAME == "generic":
|
||||||
|
|
Ładowanie…
Reference in New Issue