From 950624dd4bb0e917abbe58c98351bbabd26d0bb3 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Feb 2025 20:26:00 +0000 Subject: [PATCH] Fix S3 storage to media in whisper_enricher.py. --- .../modules/whisper_enricher/__manifest__.py | 7 +++++-- .../whisper_enricher/whisper_enricher.py | 19 ++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 884de66..1539df6 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -{ +a={ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, @@ -12,7 +12,9 @@ "help": "WhisperApi api key for authentication"}, "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + "action": {"default": "translate", + "help": "which Whisper operation to execute", + "choices": ["transcribe", "translate", "language_detection"]}, }, "description": """ Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files. @@ -27,6 +29,7 @@ ### Notes - Requires a Whisper API endpoint and API key for authentication. - Only compatible with S3-compatible storage systems for media file accessibility. + - ** This stores the media files in S3 prior to enriching them as Whisper requires public URLs to access the media files. - Handles multiple jobs and retries for failed or incomplete processing. """ } diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a7298e4..004d91c 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -15,17 +15,21 @@ class WhisperEnricher(Enricher): """ def enrich(self, to_enrich: Metadata) -> None: - if not self._get_s3_storage(): + storages = self.config['steps']['storages'] + if not "s3_storage" in storages: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return + self.s3 = get_module("s3_storage", self.config) url = to_enrich.get_url() logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich, storages=self.storages) + # TODO: this used to pass all storage items to store now + # Now only passing S3, the rest will get added later in the usual order (?) + m.store(url=url, metadata=to_enrich, storages=[self.s3]) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -53,8 +57,8 @@ class WhisperEnricher(Enricher): to_enrich.set_content(f"\n[automatic video transcript]: {v}") def submit_job(self, media: Media): - s3 = get_module("s3_storage", self.config) - s3_url = s3.get_cdn_url(media) + + s3_url = self.s3.get_cdn_url(media) assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls " payload = { "url": s3_url, @@ -107,10 +111,3 @@ class WhisperEnricher(Enricher): logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}") return result return False - - def _get_s3_storage(self) -> S3Storage: - try: - return next(s for s in self.config['steps']['storages'] if s == 's3_storage') - except: - logger.warning("No S3Storage instance found in storages") - return