From dd07b0b830c1ef8d8a053259bb47f26aa70af8ac Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 24 Feb 2025 11:40:44 +0000 Subject: [PATCH] Allow flexible extractor_args in generic_extractor.py. --- .../modules/generic_extractor/__manifest__.py | 5 ++ .../generic_extractor/generic_extractor.py | 75 +++++++++++++++---- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index caa3ae1..2936983 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -64,5 +64,10 @@ via the command line using the `--dropins` option (TODO!). "default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.", }, + "extractor_args": { + "default": {}, + "help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.", + "type": "json_loader", + }, }, } diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 47c03f6..8ceda27 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -170,8 +170,8 @@ class GenericExtractor(Extractor): logger.error(f"Error processing entry {entry}: {e}") return self.add_metadata(data, info_extractor, url, result) - - def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]: + + def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]: dropin_name = dropin_name.lower() if dropin_name == "generic": @@ -179,6 +179,7 @@ class GenericExtractor(Extractor): return None dropin_class_name = dropin_name.title() + def _load_dropin(dropin): dropin_class = getattr(dropin, dropin_class_name)() return self._dropins.setdefault(dropin_name, dropin_class) @@ -202,7 +203,7 @@ class GenericExtractor(Extractor): return _load_dropin(dropin) except (FileNotFoundError, ModuleNotFoundError): pass - + # fallback to loading the dropins within auto-archiver try: return _load_dropin(importlib.import_module(f".{dropin_name}", package=package)) @@ -241,7 +242,8 @@ class GenericExtractor(Extractor): # don't clutter the logs with issues about the 'generic' extractor not having a dropin return False - logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead') + logger.debug( + f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead') try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: @@ -273,15 +275,22 @@ class GenericExtractor(Extractor): ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), - 'quiet': False, 'noplaylist': not self.allow_playlist , - 'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles, - "live_from_start": self.live_from_start, "proxy": self.proxy, - "max_downloads": self.max_downloads, "playlistend": self.max_downloads} - - # set up auth - auth = self.auth_for_site(url, extract_cookies=False) + 'quiet': False, + 'noplaylist': not self.allow_playlist , + 'writesubtitles': self.subtitles, + 'writeautomaticsub': self.subtitles, + "live_from_start": self.live_from_start, + "proxy": self.proxy, + "max_downloads": self.max_downloads, + "playlistend": self.max_downloads, + # TODO + # "verbose": True, + # "print_traffic": True, + } - # order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file + # Set up auth + auth = self.auth_for_site(url, extract_cookies=False) + # order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if 'username' in auth and 'password' in auth: logger.debug(f'Using provided auth username and password for {url}') @@ -297,12 +306,46 @@ class GenericExtractor(Extractor): logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}') ydl_options['cookiefile'] = auth['cookies_file'] + + # Applying user-defined extractor_args + if self.extractor_args: + logger.info(f"Applying user-defined extractor_args") + ydl_options.setdefault('extractor_args', {}) + + for key, args in self.extractor_args.items(): + logger.debug(f"Setting extractor_args: {key}") + if isinstance(args, dict): + # Site specific arguments (e.g., youtube: somekey=value) + ydl_options['extractor_args'].setdefault(key, {}).update(args) + else: + # General extractor_args (e.g., somekey=value) + ydl_options['extractor_args'][key] = args + + ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" for info_extractor in self.suitable_extractors(url): - result = self.download_for_extractor(info_extractor, url, ydl) - if result: - return result - + try: + result = self.download_for_extractor(info_extractor, url, ydl) + if result: + return result + except yt_dlp.utils.ExtractorError as e: + # TODO Does this catch empty/ incomplete failures? + if self.extractor_args: + logger.warning( + f"Extraction with custom extractor_args failed for {url}. Retrying without extractor_args...") + # Remove extractor_args and try without + del ydl_options['extractor_args'] + ydl = yt_dlp.YoutubeDL(ydl_options) + try: + result = self.download_for_extractor(info_extractor, url, ydl) + if result: + return result + except Exception as retry_error: + logger.error(f"Extraction failed for {url} after retrying: {retry_error}") + return False + else: + logger.error(f"Extraction failed for {url}: {e}") + return False return False