diff --git a/Dockerfile b/Dockerfile
index 5db284a..96b8405 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# RUN curl -fsSL https://get.docker.com | sh
-# RUN git clone https://github.com/bellingcat/auto-archiver
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
ENV IS_DOCKER=1
COPY ./src/ .
-# CMD ["pipenv", "run", "python", "auto_archive.py"]
-ENTRYPOINT ["python", "auto_archive.py"]
+# TODO: figure out how to make volumes not owned by root; does that depend on the host or on the Dockerfile?
+# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
+# USER archiver
+ENTRYPOINT ["python"]
# ENTRYPOINT ["docker-entrypoint.sh"]
-# should be executed with 2 volumes
+# should be executed with 2 volumes (3 if local_storage)
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
\ No newline at end of file
diff --git a/src/archivers/base_archiver.py b/src/archivers/base_archiver.py
index 5ef2b7e..75395b5 100644
--- a/src/archivers/base_archiver.py
+++ b/src/archivers/base_archiver.py
@@ -1,8 +1,9 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange
+from collections import defaultdict
import ffmpeg
from loguru import logger
@@ -27,6 +28,7 @@ class ArchiveResult:
screenshot: str = None
wacz: str = None
hash: str = None
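+    # one entry per stored artifact: {"url", "mime"} plus optional "key" and "hash" (see Archiver.add_to_media)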
+ media: list = field(default_factory=list)
class Archiver(ABC):
name = "default"
@@ -38,6 +40,7 @@ class Archiver(ABC):
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config
self.is_docker = config.is_docker
+ self.media = []
def __str__(self):
return self.__class__.__name__
@@ -48,13 +51,28 @@ class Archiver(ABC):
@abstractmethod
def download(self, url, check_if_exists=False): pass
+    def generateArchiveResult(self, **kwargs):
+        # make sure the main cdn_url (and its hash) is also part of the media list
+        if "cdn_url" in kwargs:
+            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
+        # remove duplicate media entries before building the result
+        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
+        return ArchiveResult(**kwargs)
+
def get_netloc(self, url):
return urlparse(url).netloc
+ def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
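+        # record an archived artifact (url, guessed mime type, optional storage key and hash) for the ArchiveResult media list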
+ media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
+ if key: media_info["key"] = key
+ if hash: media_info["hash"] = hash
+ self.media.append(media_info)
+
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
"""
Generates an index.html page where each @urls_info is displayed
"""
+ for ui in urls_info:
+ self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
page = f'''
{url}
Archived media from {self.name}
@@ -109,6 +127,8 @@ class Archiver(ABC):
For a list of media urls, fetch them, upload them
and call self.generate_media_page_html with them
"""
+ for media_url in urls:
+ self.add_to_media(media_url)
thumbnail = None
uploaded_media = []
@@ -201,17 +221,20 @@ class Archiver(ABC):
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
- return self.storage.get_cdn_url(key)
+ cdn_url = self.storage.get_cdn_url(key)
+ self.add_to_media(cdn_url, key)
+
+ return cdn_url
def get_wacz(self, url):
if not self.browsertrix.enabled:
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
- return
+ return
if self.is_docker:
# TODO: figure out support for browsertrix in docker
# see: https://github.com/bellingcat/auto-archiver/issues/66
logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
- return
+ return
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
@@ -220,7 +243,7 @@ class Archiver(ABC):
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
- "--rm", # delete container once it has completed running
+ "--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
@@ -253,18 +276,19 @@ class Archiver(ABC):
# do not crash if upload fails
try:
self.storage.upload(filename, key, extra_args={
- 'ACL': 'public-read', 'ContentType': 'application/zip'})
+ 'ACL': 'public-read', 'ContentType': 'application/zip'})
except FileNotFoundError as e:
logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
-
# clean up the local browsertrix files
try:
shutil.rmtree(browsertrix_home)
except PermissionError:
logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
- return self.storage.get_cdn_url(key)
+ cdn_url = self.storage.get_cdn_url(key)
+ self.add_to_media(cdn_url, key)
+ return cdn_url
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
diff --git a/src/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py
index a2b1147..62db876 100644
--- a/src/archivers/instagram_archiver.py
+++ b/src/archivers/instagram_archiver.py
@@ -52,7 +52,7 @@ class InstagramArchiver(Archiver):
cdn_url = self.storage.get_cdn_url(key)
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
try:
# process if post
@@ -137,4 +137,4 @@ class InstagramArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py
index 026bdd0..c6d8747 100644
--- a/src/archivers/telegram_archiver.py
+++ b/src/archivers/telegram_archiver.py
@@ -47,7 +47,7 @@ class TelegramArchiver(Archiver):
time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
video_url = video.get('src')
video_id = video_url.split('/')[-1].split('?')[0]
@@ -85,5 +85,5 @@ class TelegramArchiver(Archiver):
os.remove(filename)
cdn_url = self.storage.get_cdn_url(key)
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py
index 5c147de..f0ff194 100644
--- a/src/archivers/telethon_archiver.py
+++ b/src/archivers/telethon_archiver.py
@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists as not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
key_thumb, thumb_index = None, None
group_id = post.grouped_id if post.grouped_id is not None else post.id
@@ -119,7 +119,7 @@ class TelethonArchiver(Archiver):
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
- return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
- return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py
index bdaad52..55cb97e 100644
--- a/src/archivers/tiktok_archiver.py
+++ b/src/archivers/tiktok_archiver.py
@@ -28,9 +28,9 @@ class TiktokArchiver(Archiver):
if len(media) <= 0:
if status == 'already archived':
- return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
+ return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
else:
- return ArchiveResult(status='Could not download media')
+ return self.generateArchiveResult(status='Could not download media')
logger.info(f'downloading video {key=}')
media[0].download(filename)
@@ -56,17 +56,17 @@ class TiktokArchiver(Archiver):
cdn_url = self.storage.get_cdn_url(key)
timestamp = info.create.isoformat() if hasattr(info, "create") else None
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
- return ArchiveResult(status=status)
+ return self.generateArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error' + str(error))
- return ArchiveResult(status=status)
+ return self.generateArchiveResult(status=status)
diff --git a/src/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py
index 454cfe2..da56d31 100644
--- a/src/archivers/twitter_api_archiver.py
+++ b/src/archivers/twitter_api_archiver.py
@@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver):
# only s3 storage supports storage.exists as not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
screenshot = self.get_screenshot(url)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
urls = []
if tweet.includes:
@@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
diff --git a/src/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py
index b868af5..f1f22c0 100644
--- a/src/archivers/twitter_archiver.py
+++ b/src/archivers/twitter_archiver.py
@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
- return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
urls = []
@@ -62,7 +62,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
def download_alternative(self, url, tweet_id):
# https://stackoverflow.com/a/71867055/6196010
@@ -87,7 +87,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
def choose_variant(self, variants):
# choosing the highest quality possible
diff --git a/src/archivers/vk_archiver.py b/src/archivers/vk_archiver.py
index 91b8354..1d38fa9 100644
--- a/src/archivers/vk_archiver.py
+++ b/src/archivers/vk_archiver.py
@@ -31,7 +31,7 @@ class VkArchiver(Archiver):
# if check_if_exists and self.storage.exists(key):
# screenshot = self.get_screenshot(url)
# cdn_url = self.storage.get_cdn_url(key)
- # return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
+ # return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
if len(results) == 0:
@@ -71,4 +71,4 @@ class VkArchiver(Archiver):
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
diff --git a/src/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py
index e0ede90..1bfa78a 100644
--- a/src/archivers/wayback_archiver.py
+++ b/src/archivers/wayback_archiver.py
@@ -39,7 +39,7 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
- return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
if 'job_id' not in r.json() and 'message' in r.json():
return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
retries += 1
if status_r.status_code != 200:
- return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
status_json = status_r.json()
if status_json['status'] != 'success':
@@ -77,7 +77,7 @@ class WaybackArchiver(Archiver):
title = 'Could not get title'
except:
title = "Could not get title"
- self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
+ self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
return self.seen_urls[url]
def custom_retry(self, json_data, **kwargs):
@@ -86,4 +86,4 @@ class WaybackArchiver(Archiver):
return self.signal_retry_in(**kwargs)
if "this host has been already captured" in str(json_data).lower():
return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later
- return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
+ return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
diff --git a/src/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py
index 5d09442..e2f27a2 100644
--- a/src/archivers/youtubedl_archiver.py
+++ b/src/archivers/youtubedl_archiver.py
@@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver):
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
- return ArchiveResult(status="Streaming media")
+ return self.generateArchiveResult(status="Streaming media")
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
@@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver):
elif 'upload_date' in info and info['upload_date'] is not None:
timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/auto_archive.py b/src/auto_archive.py
index 3412b0a..a797405 100644
--- a/src/auto_archive.py
+++ b/src/auto_archive.py
@@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet):
return missing
-def should_process_sheet(c, sheet_name):
+def should_process_sheet(c: Config, sheet_name):
if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
# ALLOW rules exist AND sheet name not explicitly allowed
return False
@@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name):
return True
+def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
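+    """Tries each active archiver on @url in order, stopping at the first success; returns that ArchiveResult, or the last archiver's result (possibly False) if none succeeds."""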
+ url = expand_url(url)
+ c.set_folder(folder)
+ storage = c.get_storage()
+
+    # make a new driver so every archived URL gets a fresh, isolated browser session
+ c.recreate_webdriver()
+
+ # order matters, first to succeed excludes remaining
+ active_archivers = [
+ TelethonArchiver(storage, c),
+ TiktokArchiver(storage, c),
+ TwitterApiArchiver(storage, c),
+ InstagramArchiver(storage, c),
+ YoutubeDLArchiver(storage, c),
+ TelegramArchiver(storage, c),
+ TwitterArchiver(storage, c),
+ VkArchiver(storage, c),
+ WaybackArchiver(storage, c)
+ ]
+
+ for archiver in active_archivers:
+ logger.debug(f'Trying {archiver} on {debug_string}')
+
+ try:
+ result = archiver.download(url, check_if_exists=c.check_if_exists)
+ except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+ except Exception as e:
+ result = False
+ logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
+
+ if result:
+ success = result.status in ['success', 'already archived']
+ result.status = f"{archiver.name}: {result.status}"
+ if success:
+ logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
+ break
+ # only 1 retry possible for now
+ if is_retry and Archiver.is_retry(result.status):
+ result.status = Archiver.remove_retry(result.status)
+ logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
+ return result
+
+
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
@@ -100,46 +144,7 @@ def process_sheet(c: Config):
# All checks done - archival process starts here
try:
gw.set_cell(row, 'status', 'Archive in progress')
- url = expand_url(url)
- c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
-
- # make a new driver so each spreadsheet row is idempotent
- c.recreate_webdriver()
-
- # order matters, first to succeed excludes remaining
- active_archivers = [
- TelethonArchiver(storage, c),
- TiktokArchiver(storage, c),
- TwitterApiArchiver(storage, c),
- InstagramArchiver(storage, c),
- YoutubeDLArchiver(storage, c),
- TelegramArchiver(storage, c),
- TwitterArchiver(storage, c),
- VkArchiver(storage, c),
- WaybackArchiver(storage, c)
- ]
-
- for archiver in active_archivers:
- logger.debug(f'Trying {archiver} on {row=}')
-
- try:
- result = archiver.download(url, check_if_exists=c.check_if_exists)
- except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
- except Exception as e:
- result = False
- logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
- if result:
- success = result.status in ['success', 'already archived']
- result.status = f"{archiver.name}: {result.status}"
- if success:
- logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
- break
- # only 1 retry possible for now
- if is_retry and Archiver.is_retry(result.status):
- result.status = Archiver.remove_retry(result.status)
- logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
-
+ result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
if result:
update_sheet(gw, row, url, result)
else:
diff --git a/src/cli.py b/src/cli.py
index e69de29..b6d2b70 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -0,0 +1,30 @@
+import tempfile, json
+import auto_archive
+from loguru import logger
+from configs import Config
+from storages import Storage
+from slugify import slugify
+
+
+def main():
+ c = Config()
+ c.parse()
+ url = c.url
+ if not url:
+ logger.error("Invalid URL: '{url}'")
+ return
+ logger.info(f'Archiving "{url=}".')
+ with tempfile.TemporaryDirectory(dir="./") as tmpdir:
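+        # downloads are kept in a temporary folder (Storage.TMP_FOLDER) that is removed when the run finishes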
+ Storage.TMP_FOLDER = tmpdir
+ result = auto_archive.archive_url(c, url, "", f"{url=}", False)
+ c.destroy_webdriver()
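+        # write the media manifest locally, then upload it to the configured storage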
+ key = f"media_{slugify(url)}.json"
+ with open(key, "w", encoding="utf-8") as outf:
+ json.dump(result.media, outf, ensure_ascii=False, indent=4)
+ c.get_storage().upload(key, key)
+ print(result)
+ return result
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/configs/config.py b/src/configs/config.py
index 01b8173..bbd385e 100644
--- a/src/configs/config.py
+++ b/src/configs/config.py
@@ -47,6 +47,8 @@ class Config:
with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = yaml.safe_load(inf)
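+        # single-URL mode: set when invoked via cli.py with --url, empty for the Google Sheets workflow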
+ self.url = getattr_or(self.args, "url", '')
+
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {})
@@ -211,6 +213,7 @@ class Config:
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
+        parser.add_argument('--url', action='store', dest='url', help='single URL to archive - only used by cli.py, not by the Google Sheets workflow')
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
diff --git a/src/storages/base_storage.py b/src/storages/base_storage.py
index cde00fe..f147678 100644
--- a/src/storages/base_storage.py
+++ b/src/storages/base_storage.py
@@ -1,3 +1,4 @@
+import os, uuid
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path
@@ -18,6 +19,14 @@ class Storage(ABC):
@abstractmethod
def uploadf(self, file, key, **kwargs): pass
+ def clean_key(self, key):
+        # some storages do not work well with leading forward slashes, and some keys come with one
+ if key.startswith('/'):
+ logger.debug(f'Found and fixed a leading "/" for {key=}')
+ return key[1:]
+ return key
+
def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
diff --git a/src/storages/gd_storage.py b/src/storages/gd_storage.py
index 5f3bbeb..3af77f1 100644
--- a/src/storages/gd_storage.py
+++ b/src/storages/gd_storage.py
@@ -116,13 +116,6 @@ class GDStorage(Storage):
# GD only requires the filename not a file reader
self.uploadf(filename, key, **kwargs)
- def clean_key(self, key):
- # GDrive does not work well with trailing forward slashes and some keys come with that
- if key.startswith('/'):
- logger.debug(f'Found and fixed a leading "/" for {key=}')
- return key[1:]
- return key
-
# gets the Drive folderID if it is there
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
"""
diff --git a/src/storages/local_storage.py b/src/storages/local_storage.py
index ca328e0..1109767 100644
--- a/src/storages/local_storage.py
+++ b/src/storages/local_storage.py
@@ -1,6 +1,7 @@
import os
from dataclasses import dataclass
+from loguru import logger
from .base_storage import Storage
from utils import mkdir_if_not_exists
@@ -18,8 +19,12 @@ class LocalStorage(Storage):
mkdir_if_not_exists(self.save_to)
def get_cdn_url(self, key):
+ key = self.clean_key(key)
+        logger.debug(f"{key=}")
full_path = os.path.join(self.save_to, self.folder, key)
- mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
+ logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
+ os.makedirs(os.path.dirname(full_path), exist_ok=True)
return os.path.abspath(full_path)
def exists(self, key):