Mirror of https://github.com/bellingcat/auto-archiver
WIP docker changes for cli and auto_archiver
parent 390b84eb22
commit 04263094ad
@@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \
 # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
 # RUN curl -fsSL https://get.docker.com | sh
 
-# RUN git clone https://github.com/bellingcat/auto-archiver
 # TODO: avoid copying unnecessary files, including .git
 COPY Pipfile Pipfile.lock ./
 RUN pipenv install --python=3.10 --system --deploy
 ENV IS_DOCKER=1
 COPY ./src/ .
 
-# CMD ["pipenv", "run", "python", "auto_archive.py"]
-ENTRYPOINT ["python", "auto_archive.py"]
+# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
+# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
+# USER archiver
+ENTRYPOINT ["python"]
 # ENTRYPOINT ["docker-entrypoint.sh"]
 
-# should be executed with 2 volumes
+# should be executed with 2 volumes (3 if local_storage)
 # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
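Note on the entrypoint change above: with ENTRYPOINT ["python"], the script to run becomes the first container argument, so the same image can serve both the spreadsheet flow and the new single-URL flow. Assuming the image is tagged aa as in the example comment, an invocation would presumably look like "docker run ... aa cli.py --url=https://example.com/post" or "docker run ... aa auto_archive.py --sheet=...", where the URL and sheet name are placeholders.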
@@ -1,8 +1,9 @@
 import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
 from random import randrange
+from collections import defaultdict
 
 import ffmpeg
 from loguru import logger

@@ -27,6 +28,7 @@ class ArchiveResult:
     screenshot: str = None
     wacz: str = None
     hash: str = None
+    media: list = field(default_factory=list)
 
 class Archiver(ABC):
     name = "default"

@@ -38,6 +40,7 @@ class Archiver(ABC):
         self.hash_algorithm = config.hash_algorithm
         self.browsertrix = config.browsertrix_config
         self.is_docker = config.is_docker
+        self.media = []
 
     def __str__(self):
         return self.__class__.__name__
@@ -48,13 +51,28 @@ class Archiver(ABC):
     @abstractmethod
     def download(self, url, check_if_exists=False): pass
 
+    def generateArchiveResult(self, **kwargs):
+        # remove duplicates
+        if "cdn_url" in kwargs:
+            self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
+        kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
+        return ArchiveResult(**kwargs)
+
     def get_netloc(self, url):
         return urlparse(url).netloc
 
+    def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
+        media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
+        if key: media_info["key"] = key
+        if hash: media_info["hash"] = hash
+        self.media.append(media_info)
+
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
         """
+        for ui in urls_info:
+            self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
         page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
 <body>
 <h2>Archived media from {self.name}</h2>
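The de-duplication line in generateArchiveResult turns each media dict into a tuple of its items so a set can drop exact duplicates; a standalone sketch of the same expression, with invented values:

    media = [
        {"url": "https://cdn.example.com/a.jpg", "mime": "image/jpeg"},
        {"url": "https://cdn.example.com/a.jpg", "mime": "image/jpeg"},  # exact duplicate
        {"url": "https://cdn.example.com/b.mp4", "mime": "video/mp4"},
    ]
    deduped = [dict(t) for t in {tuple(d.items()) for d in media}]
    print(len(deduped))  # 2 -- the set round-trip drops duplicates but does not preserve order

Each entry follows the shape built by add_to_media above: always "url" and "mime", plus "key" and "hash" when known.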
@@ -109,6 +127,8 @@ class Archiver(ABC):
         For a list of media urls, fetch them, upload them
         and call self.generate_media_page_html with them
         """
+        for media_url in urls:
+            self.add_to_media(media_url)
 
         thumbnail = None
         uploaded_media = []

@@ -201,17 +221,20 @@ class Archiver(ABC):
         self.driver.save_screenshot(filename)
         self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
 
-        return self.storage.get_cdn_url(key)
+        cdn_url = self.storage.get_cdn_url(key)
+        self.add_to_media(cdn_url, key)
+
+        return cdn_url
 
     def get_wacz(self, url):
         if not self.browsertrix.enabled:
             logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
             return
         if self.is_docker:
             # TODO: figure out support for browsertrix in docker
             # see: https://github.com/bellingcat/auto-archiver/issues/66
             logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
             return
 
         logger.debug(f"getting wacz for {url}")
         key = self._get_key_from_url(url, ".wacz", append_datetime=True)

@@ -220,7 +243,7 @@ class Archiver(ABC):
         browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
         cmd = [
             "docker", "run",
             "--rm", # delete container once it has completed running
             "-v", f"{browsertrix_home}:/crawls/",
             # "-it", # this leads to "the input device is not a TTY"
             "webrecorder/browsertrix-crawler", "crawl",

@@ -253,18 +276,19 @@ class Archiver(ABC):
         # do not crash if upload fails
         try:
             self.storage.upload(filename, key, extra_args={
                 'ACL': 'public-read', 'ContentType': 'application/zip'})
         except FileNotFoundError as e:
             logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
 
-
         # clean up the local browsertrix files
         try:
             shutil.rmtree(browsertrix_home)
         except PermissionError:
             logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
 
-        return self.storage.get_cdn_url(key)
+        cdn_url = self.storage.get_cdn_url(key)
+        self.add_to_media(cdn_url, key)
+        return cdn_url
 
     def get_thumbnails(self, filename, key, duration=None):
         thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
@@ -52,7 +52,7 @@ class InstagramArchiver(Archiver):
             cdn_url = self.storage.get_cdn_url(key)
             screenshot = self.get_screenshot(url)
             wacz = self.get_wacz(url)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
 
         try:
             # process if post

@@ -137,4 +137,4 @@ class InstagramArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
 
-        return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)

@@ -47,7 +47,7 @@ class TelegramArchiver(Archiver):
             time_elements = s.find_all('time')
             timestamp = time_elements[0].get('datetime') if len(time_elements) else None
 
-            return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
+            return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
 
         video_url = video.get('src')
         video_id = video_url.split('/')[-1].split('?')[0]

@@ -85,5 +85,5 @@ class TelegramArchiver(Archiver):
         os.remove(filename)
 
         cdn_url = self.storage.get_cdn_url(key)
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                              duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)

@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
         if check_if_exists and self.storage.exists(key):
             # only s3 storage supports storage.exists as not implemented on gd
             cdn_url = self.storage.get_cdn_url(key)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
 
         key_thumb, thumb_index = None, None
         group_id = post.grouped_id if post.grouped_id is not None else post.id

@@ -119,7 +119,7 @@ class TelethonArchiver(Archiver):
 
             page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
 
-            return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
+            return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
 
         page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
-        return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
+        return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)

@@ -28,9 +28,9 @@ class TiktokArchiver(Archiver):
 
             if len(media) <= 0:
                 if status == 'already archived':
-                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
+                    return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                 else:
-                    return ArchiveResult(status='Could not download media')
+                    return self.generateArchiveResult(status='Could not download media')
 
             logger.info(f'downloading video {key=}')
             media[0].download(filename)

@@ -56,17 +56,17 @@ class TiktokArchiver(Archiver):
             cdn_url = self.storage.get_cdn_url(key)
             timestamp = info.create.isoformat() if hasattr(info, "create") else None
 
-            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+            return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                  thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
                                  timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
 
         except tiktok_downloader.Except.InvalidUrl as e:
             status = 'Invalid URL'
             logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
-            return ArchiveResult(status=status)
+            return self.generateArchiveResult(status=status)
 
         except:
             error = traceback.format_exc()
             status = 'Other Tiktok error: ' + str(error)
             logger.warning(f'Other Tiktok error' + str(error))
-            return ArchiveResult(status=status)
+            return self.generateArchiveResult(status=status)
@@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver):
             # only s3 storage supports storage.exists as not implemented on gd
             cdn_url = self.storage.get_cdn_url(key)
             screenshot = self.get_screenshot(url)
-            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
+            return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
 
         urls = []
         if tweet.includes:

@@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)

@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
             screenshot = self.get_screenshot(url)
             wacz = self.get_wacz(url)
             page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
-            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
 
         urls = []
 

@@ -62,7 +62,7 @@ class TwitterArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
 
     def download_alternative(self, url, tweet_id):
         # https://stackoverflow.com/a/71867055/6196010

@@ -87,7 +87,7 @@ class TwitterArchiver(Archiver):
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
         page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
 
     def choose_variant(self, variants):
         # choosing the highest quality possible

@@ -31,7 +31,7 @@ class VkArchiver(Archiver):
         # if check_if_exists and self.storage.exists(key):
         #     screenshot = self.get_screenshot(url)
         #     cdn_url = self.storage.get_cdn_url(key)
-        #     return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
+        #     return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
 
         results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
         if len(results) == 0:

@@ -71,4 +71,4 @@ class VkArchiver(Archiver):
         # # if multiple wall/photos/videos are present the screenshot will only grab the 1st
         screenshot = self.get_screenshot(url)
         wacz = self.get_wacz(url)
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
+        return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
@@ -39,7 +39,7 @@ class WaybackArchiver(Archiver):
 
         if r.status_code != 200:
             logger.warning(f"Internet archive failed with status of {r.status_code}")
-            return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
 
         if 'job_id' not in r.json() and 'message' in r.json():
             return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)

@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
             retries += 1
 
         if status_r.status_code != 200:
-            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
+            return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
 
         status_json = status_r.json()
         if status_json['status'] != 'success':

@@ -77,7 +77,7 @@ class WaybackArchiver(Archiver):
                 title = 'Could not get title'
         except:
             title = "Could not get title"
-        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
+        self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
         return self.seen_urls[url]
 
     def custom_retry(self, json_data, **kwargs):

@@ -86,4 +86,4 @@ class WaybackArchiver(Archiver):
             return self.signal_retry_in(**kwargs)
         if "this host has been already captured" in str(json_data).lower():
             return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later
-        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
+        return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)

@@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver):
 
         if info.get('is_live', False):
             logger.warning("Live streaming media, not archiving now")
-            return ArchiveResult(status="Streaming media")
+            return self.generateArchiveResult(status="Streaming media")
 
         if 'twitter.com' in netloc:
             if 'https://twitter.com/' in info['webpage_url']:

@@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver):
         elif 'upload_date' in info and info['upload_date'] is not None:
             timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
 
-        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+        return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
@@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet):
     return missing
 
 
-def should_process_sheet(c, sheet_name):
+def should_process_sheet(c: Config, sheet_name):
     if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
         # ALLOW rules exist AND sheet name not explicitly allowed
         return False

@@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name):
     return True
 
 
+def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
+    url = expand_url(url)
+    c.set_folder(folder)
+    storage = c.get_storage()
+
+    # make a new driver so each spreadsheet row is idempotent
+    c.recreate_webdriver()
+
+    # order matters, first to succeed excludes remaining
+    active_archivers = [
+        TelethonArchiver(storage, c),
+        TiktokArchiver(storage, c),
+        TwitterApiArchiver(storage, c),
+        InstagramArchiver(storage, c),
+        YoutubeDLArchiver(storage, c),
+        TelegramArchiver(storage, c),
+        TwitterArchiver(storage, c),
+        VkArchiver(storage, c),
+        WaybackArchiver(storage, c)
+    ]
+
+    for archiver in active_archivers:
+        logger.debug(f'Trying {archiver} on {debug_string}')
+
+        try:
+            result = archiver.download(url, check_if_exists=c.check_if_exists)
+        except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+        except Exception as e:
+            result = False
+            logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
+
+        if result:
+            success = result.status in ['success', 'already archived']
+            result.status = f"{archiver.name}: {result.status}"
+            if success:
+                logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
+                break
+            # only 1 retry possible for now
+            if is_retry and Archiver.is_retry(result.status):
+                result.status = Archiver.remove_retry(result.status)
+            logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
+
+    return result
+
+
 def process_sheet(c: Config):
     sh = c.gsheets_client.open(c.sheet)
 
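This archive_url helper is the per-URL loop lifted out of process_sheet (removed in the next hunk) so the spreadsheet flow and the new src/cli.py can share it. A minimal direct call, mirroring what cli.py does later in this diff (the URL is a placeholder):

    result = archive_url(c, "https://example.com/post", "", "url=...", is_retry=False)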
@@ -100,46 +144,7 @@ def process_sheet(c: Config):
             # All checks done - archival process starts here
             try:
                 gw.set_cell(row, 'status', 'Archive in progress')
-                url = expand_url(url)
-                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
-
-                # make a new driver so each spreadsheet row is idempotent
-                c.recreate_webdriver()
-
-                # order matters, first to succeed excludes remaining
-                active_archivers = [
-                    TelethonArchiver(storage, c),
-                    TiktokArchiver(storage, c),
-                    TwitterApiArchiver(storage, c),
-                    InstagramArchiver(storage, c),
-                    YoutubeDLArchiver(storage, c),
-                    TelegramArchiver(storage, c),
-                    TwitterArchiver(storage, c),
-                    VkArchiver(storage, c),
-                    WaybackArchiver(storage, c)
-                ]
-
-                for archiver in active_archivers:
-                    logger.debug(f'Trying {archiver} on {row=}')
-
-                    try:
-                        result = archiver.download(url, check_if_exists=c.check_if_exists)
-                    except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
-                    except Exception as e:
-                        result = False
-                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
-                    if result:
-                        success = result.status in ['success', 'already archived']
-                        result.status = f"{archiver.name}: {result.status}"
-                        if success:
-                            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
-                            break
-                        # only 1 retry possible for now
-                        if is_retry and Archiver.is_retry(result.status):
-                            result.status = Archiver.remove_retry(result.status)
-                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
-
+                result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
                 if result:
                     update_sheet(gw, row, url, result)
                 else:
src/cli.py (new file)
@@ -0,0 +1,30 @@
+import tempfile, json
+import auto_archive
+from loguru import logger
+from configs import Config
+from storages import Storage
+from slugify import slugify
+
+
+def main():
+    c = Config()
+    c.parse()
+    url = c.url
+    if not url:
+        logger.error("Invalid URL: '{url}'")
+        return
+    logger.info(f'Archiving "{url=}".')
+    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
+        Storage.TMP_FOLDER = tmpdir
+        result = auto_archive.archive_url(c, url, "", f"{url=}", False)
+        c.destroy_webdriver()
+        key = f"media_{slugify(url)}.json"
+        with open(key, "w", encoding="utf-8") as outf:
+            json.dump(result.media, outf, ensure_ascii=False, indent=4)
+        c.get_storage().upload(key, key)
+        print(result)
+        return result
+
+
+if __name__ == "__main__":
+    main()
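A likely way to exercise this new single-URL mode (cli.py imports auto_archive, configs and storages as top-level modules, so it is presumably run from inside src/ or through the Docker entrypoint): python cli.py --url=https://example.com/post --config=config.yaml. It archives that one URL, writes the collected media list to media_<slugified-url>.json and uploads that JSON through the configured storage; the example URL is a placeholder.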
@@ -47,6 +47,8 @@ class Config:
         with open(self.config_file, "r", encoding="utf-8") as inf:
             self.config = yaml.safe_load(inf)
 
+        self.url = getattr_or(self.args, "url", '')
+
         # ----------------------EXECUTION - execution configurations
         execution = self.config.get("execution", {})
 

@@ -211,6 +213,7 @@ class Config:
         """
         parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
 
+        parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
         parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
         parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
         parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
@@ -1,3 +1,4 @@
+import os, uuid
 from loguru import logger
 from abc import ABC, abstractmethod
 from pathlib import Path

@@ -18,6 +19,14 @@ class Storage(ABC):
     @abstractmethod
     def uploadf(self, file, key, **kwargs): pass
 
+    def clean_key(self, key):
+        # Some storages does not work well with trailing forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
+
     def upload(self, filename: str, key: str, **kwargs):
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         with open(filename, 'rb') as f:
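clean_key only strips a single leading slash so keys join cleanly onto folder paths; a quick sketch, assuming storage is any concrete Storage instance and the keys are invented:

    storage.clean_key("/folder/img.jpg")   # -> "folder/img.jpg"
    storage.clean_key("folder/img.jpg")    # -> "folder/img.jpg" (unchanged)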
@@ -116,13 +116,6 @@ class GDStorage(Storage):
         # GD only requires the filename not a file reader
         self.uploadf(filename, key, **kwargs)
 
-    def clean_key(self, key):
-        # GDrive does not work well with trailing forward slashes and some keys come with that
-        if key.startswith('/'):
-            logger.debug(f'Found and fixed a leading "/" for {key=}')
-            return key[1:]
-        return key
-
     # gets the Drive folderID if it is there
     def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
         """

@@ -1,6 +1,7 @@
 import os
 
 from dataclasses import dataclass
+from loguru import logger
 
 from .base_storage import Storage
 from utils import mkdir_if_not_exists

@@ -18,8 +19,12 @@ class LocalStorage(Storage):
         mkdir_if_not_exists(self.save_to)
 
     def get_cdn_url(self, key):
+        key = self.clean_key(key)
+        logger.info(f"{key=}")
         full_path = os.path.join(self.save_to, self.folder, key)
-        mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
+        logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
+        os.makedirs(os.path.dirname(full_path), exist_ok=True)
+        # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
         return os.path.abspath(full_path)
 
     def exists(self, key):
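The os.makedirs call replaces the manual path-splitting mkdir_if_not_exists line with the standard-library equivalent; a minimal sketch with an invented path:

    import os
    full_path = "local_archive/some-folder/sub/key.jpg"
    os.makedirs(os.path.dirname(full_path), exist_ok=True)  # creates local_archive/some-folder/sub if missing, no error if it already exists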