diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 3ed6629..87bdbf6 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -20,6 +20,7 @@ from loguru import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media from auto_archiver.utils import get_datetime_from_str +from auto_archiver.utils.misc import ydl_entry_to_filename from .dropin import GenericDropin @@ -382,27 +383,13 @@ class GenericExtractor(Extractor): entries = [data] result = Metadata() - def _helper_get_filename(entry: dict) -> str: - entry_url = entry.get("url") - filename = ydl.prepare_filename(entry) - base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext' - directory = os.path.dirname(base_filename) # '/get/path/to' - basename = os.path.basename(base_filename) # 'file' - for f in os.listdir(directory): - if ( - f.startswith(basename) - or (entry_url and os.path.splitext(f)[0] in entry_url) - and "video/" in (mimetypes.guess_type(f)[0] or "") - ): - return os.path.join(directory, f) - return False for entry in entries: try: - filename = _helper_get_filename(entry) + filename = ydl_entry_to_filename(ydl, entry) - if not filename or not os.path.exists(filename): + if not filename: # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies. continue diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index fe1864b..5b41a04 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -1,5 +1,6 @@ import hashlib import json +import mimetypes import os import uuid from datetime import datetime, timezone @@ -116,3 +117,26 @@ def get_timestamp(ts, utc=True, iso=True, dayfirst=True) -> str | datetime | Non def get_current_timestamp() -> str: return get_timestamp(datetime.now()) + + +def ydl_entry_to_filename(ydl, entry: dict) -> str: + import yt_dlp + + ydl: yt_dlp.YoutubeDL + entry_url = entry.get("url") + + filename = ydl.prepare_filename(entry) + if os.path.exists(filename): + return filename + + base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext' + directory = os.path.dirname(base_filename) # '/get/path/to' + basename = os.path.basename(base_filename) # 'file' + for f in os.listdir(directory): + if ( + f.startswith(basename) + or (entry_url and os.path.splitext(f)[0] in entry_url) + and "video/" in (mimetypes.guess_type(f)[0] or "") + ): + return os.path.join(directory, f) + return False \ No newline at end of file