kopia lustrzana https://github.com/bellingcat/auto-archiver
Fix S3 storage to media in whisper_enricher.py.
rodzic
2920cf685f
commit
950624dd4b
|
@ -1,4 +1,4 @@
|
||||||
{
|
a={
|
||||||
"name": "Whisper Enricher",
|
"name": "Whisper Enricher",
|
||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
|
@ -12,7 +12,9 @@
|
||||||
"help": "WhisperApi api key for authentication"},
|
"help": "WhisperApi api key for authentication"},
|
||||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||||
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
|
"action": {"default": "translate",
|
||||||
|
"help": "which Whisper operation to execute",
|
||||||
|
"choices": ["transcribe", "translate", "language_detection"]},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
||||||
|
@ -27,6 +29,7 @@
|
||||||
### Notes
|
### Notes
|
||||||
- Requires a Whisper API endpoint and API key for authentication.
|
- Requires a Whisper API endpoint and API key for authentication.
|
||||||
- Only compatible with S3-compatible storage systems for media file accessibility.
|
- Only compatible with S3-compatible storage systems for media file accessibility.
|
||||||
|
- **Note:** This stores the media files in S3 prior to enriching them, as Whisper requires public URLs to access the media files.
|
||||||
- Handles multiple jobs and retries for failed or incomplete processing.
|
- Handles multiple jobs and retries for failed or incomplete processing.
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,17 +15,21 @@ class WhisperEnricher(Enricher):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
if not self._get_s3_storage():
|
storages = self.config['steps']['storages']
|
||||||
|
if not "s3_storage" in storages:
|
||||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
self.s3 = get_module("s3_storage", self.config)
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
||||||
|
|
||||||
job_results = {}
|
job_results = {}
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if m.is_video() or m.is_audio():
|
if m.is_video() or m.is_audio():
|
||||||
m.store(url=url, metadata=to_enrich, storages=self.storages)
|
# TODO: this call used to pass every configured storage to m.store().
|
||||||
|
# It now passes only S3; the remaining storages are presumably applied later in the usual order (to be confirmed).
|
||||||
|
m.store(url=url, metadata=to_enrich, storages=[self.s3])
|
||||||
try:
|
try:
|
||||||
job_id = self.submit_job(m)
|
job_id = self.submit_job(m)
|
||||||
job_results[job_id] = False
|
job_results[job_id] = False
|
||||||
|
@ -53,8 +57,8 @@ class WhisperEnricher(Enricher):
|
||||||
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
||||||
|
|
||||||
def submit_job(self, media: Media):
|
def submit_job(self, media: Media):
|
||||||
s3 = get_module("s3_storage", self.config)
|
|
||||||
s3_url = s3.get_cdn_url(media)
|
s3_url = self.s3.get_cdn_url(media)
|
||||||
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
|
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
|
||||||
payload = {
|
payload = {
|
||||||
"url": s3_url,
|
"url": s3_url,
|
||||||
|
@ -107,10 +111,3 @@ class WhisperEnricher(Enricher):
|
||||||
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
||||||
return result
|
return result
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _get_s3_storage(self) -> S3Storage:
|
|
||||||
try:
|
|
||||||
return next(s for s in self.config['steps']['storages'] if s == 's3_storage')
|
|
||||||
except:
|
|
||||||
logger.warning("No S3Storage instance found in storages")
|
|
||||||
return
|
|
||||||
|
|
Ładowanie…
Reference in New Issue