From 6ca46417feeda7f6ac586214cbf40917f9d9b50f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 12 Jan 2023 02:09:39 +0000 Subject: [PATCH] local storage + multiple storage support --- src/archivers/telethon_archiverv2.py | 4 +- src/databases/gsheet_db.py | 4 +- src/enrichers/wayback_enricher.py | 2 +- src/formatters/templates/html_template.html | 19 +++++---- src/media.py | 28 +++++++++---- src/metadata.py | 2 - src/orchestrator.py | 14 +++---- src/storages/__init__.py | 3 +- src/storages/local.py | 46 +++++++++++++++++++++ src/storages/s3.py | 25 +++++------ src/storages/storage.py | 24 ++++++----- 11 files changed, 117 insertions(+), 54 deletions(-) create mode 100644 src/storages/local.py diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 66ecd74..6851cb5 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2): if mp.entities: other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] if len(other_media_urls): - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}") for i, om_url in enumerate(other_media_urls): filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') self.download_from_url(om_url, filename) - result.add_media(Media(filename)) + result.add_media(Media(filename=filename, id=f"{group_id}_{i}")) filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index 26aae68..0cf65ed 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -63,13 +63,13 @@ class GsheetsDb(Database): media: Media = item.get_single_media() - batch_if_valid('archive', media.cdn_url) + batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('timestamp', item.get_timestamp()) if (screenshot := item.get_media_by_id("screenshot")): - batch_if_valid('screenshot', screenshot.cdn_url) + batch_if_valid('screenshot', "\n".join(screenshot.urls)) # batch_if_valid('status', item.status) # TODO: AFTER ENRICHMENTS diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py index 09a43e0..bf55923 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/enrichers/wayback_enricher.py @@ -21,7 +21,7 @@ class WaybackEnricher(Enricher): @staticmethod def configs() -> dict: return { - "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."}, + "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} } diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index fc986f0..f488a5f 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -26,6 +26,7 @@ table, th, td { + margin: auto; border: 1px solid; border-collapse: collapse; } @@ -43,18 +44,17 @@
title: '{{ title }}'
+title: '{{ title }}'
about | -preview | +preview(s) |
---|---|---|
|
+ {% for url in m.urls %}
{% if 'image' in m.mimetype %}
- |
key | @@ -100,7 +103,7 @@
---|
made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page
+Made with bellingcat/auto-archiver