fix config parsing in manifests, remove module level configs

pull/183/head
erinhmclark 2025-01-24 13:33:12 +00:00
parent 0453d95f56
commit 024fe58377
16 changed files with 23 additions and 191 deletions
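In short: this commit deletes the per-module `configs()` static methods (several of them already commented out) so that configuration is declared only in each module's manifest, and it fixes the manifests so that `type` entries are dotted-path strings rather than imported callables. A minimal sketch of the manifest shape these hunks imply, modelled on the removed `CSVFeeder.configs()`; the surrounding keys and layout are assumptions, not code from this diff:

# Hypothetical manifest sketch; shape inferred from the hunks below.
# "type" is a plain string, so the manifest parses without importing utils.
{
    "name": "csv_feeder",
    "configs": {
        "files": {
            "default": None,
            "help": "Path to the input file(s) to read the URLs from, comma separated.",
            "type": "auto_archiver.utils.parse_csv_to_set",
        },
        "column": {
            "default": None,
            "help": "Column number or name to read the URLs from, 0-indexed",
        },
    },
}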

View file

@@ -15,10 +15,6 @@ class AtlosStorage(Storage):
    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        return dict(Storage.configs(), **get_atlos_config_options())

    def get_cdn_url(self, _media: Media) -> str:
        # It's not always possible to provide an exact URL, because it's
        # possible that the media once uploaded could have been copied to

View file

@@ -22,11 +22,6 @@ class AtlosDb(Database):
        # without this STEP.__init__ is not called
        super().__init__(config)
        # TODO

    @staticmethod
    def configs() -> dict:
        return get_atlos_config_options()

    def failed(self, item: Metadata, reason: str) -> None:
        """Update DB accordingly for failure"""
        # If the item has no Atlos ID, there's nothing for us to do

View file

@@ -15,11 +15,6 @@ class AtlosFeeder(Feeder):
        if type(self.api_token) != str:
            raise Exception("Atlos Feeder did not receive an Atlos API token")
        # TODO

    @staticmethod
    def configs() -> dict:
        return get_atlos_config_options()

    def __iter__(self) -> Metadata:
        # Get all the urls from the Atlos API
        count = 0

View file

@@ -13,16 +13,6 @@ class CLIFeeder(Feeder):
        if type(self.urls) != list or len(self.urls) == 0:
            raise Exception("CLI Feeder did not receive any URL to process")

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "urls": {
    #             "default": None,
    #             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
    #             "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
    #         },
    #     }

    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing {url}")

View file

@@ -9,23 +9,6 @@ class CSVFeeder(Feeder):
    name = "csv_feeder"

    @staticmethod
    def configs() -> dict:
        return {
            "files": {
                "default": None,
                "help": "Path to the input file(s) to read the URLs from, comma separated. \
                        Input files should be formatted with one URL per line",
                "type": "auto_archiver.utils.parse_csv_to_set",
            },
            "column": {
                "default": None,
                "help": "Column number or name to read the URLs from, 0-indexed",
            }
        }

    def __iter__(self) -> Metadata:
        url_column = self.column or 0
        for file in self.files:

View file

@@ -58,16 +58,6 @@ class GDriveStorage(Storage):
        self.service = build('drive', 'v3', credentials=creds)

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
            })

    def get_cdn_url(self, media: Media) -> str:
        """
        only support files saved in a folder for GD

View file

@@ -14,7 +14,7 @@
        "block_worksheets": {
            "default": set(),
            "help": "(CSV) explicitly block some worksheets from being processed",
            "type": auto_archiver.utils.parse_csv_to_set,
            "type": "auto_archiver.utils.parse_csv_to_set",
        },
        "use_sheet_names_in_stored_paths": {
            "default": True,

View file

@@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder):
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)

    # @staticmethod
    # def configs() -> dict:
    #     return dict(
    #         Gsheets.configs(),
    #         ** {
    #             "allow_worksheets": {
    #                 "default": set(),
    #                 "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #             },
    #             "block_worksheets": {
    #                 "default": set(),
    #                 "help": "(CSV) explicitly block some worksheets from being processed",
    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #             },
    #             "use_sheet_names_in_stored_paths": {
    #                 "default": True,
    #                 "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
    #             }
    #         })

    def __iter__(self) -> Metadata:
        sh = self.open_sheet()
        for ii, wks in enumerate(sh.worksheets()):

View file

@@ -28,12 +28,6 @@ class HtmlFormatter(Formatter):
        })
        self.template = self.environment.get_template("html_template.html")

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
    #     }

    def format(self, item: Metadata) -> Media:
        url = item.get_url()
        if item.is_empty():

View file

@@ -15,15 +15,6 @@ class LocalStorage(Storage):
        super().__init__(config)
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
            })

    def get_cdn_url(self, media: Media) -> str:
        # TODO: is this viable with Storage.configs on path/filename?
        dest = os.path.join(self.save_to, media.key)

View file

@@ -26,27 +26,6 @@ class S3Storage(Storage):
        if self.random_no_duplicate:
            logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")

    @staticmethod
    def configs() -> dict:
        return dict(
            Storage.configs(),
            ** {
                "bucket": {"default": None, "help": "S3 bucket name"},
                "region": {"default": None, "help": "S3 region name"},
                "key": {"default": None, "help": "S3 API key"},
                "secret": {"default": None, "help": "S3 API secret"},
                "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
                "endpoint_url": {
                    "default": 'https://{region}.digitaloceanspaces.com',
                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
                },
                "cdn_url": {
                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
                },
                "private": {"default": False, "help": "if true S3 files will not be readable online"},
            })

    def get_cdn_url(self, media: Media) -> str:
        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
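As the help strings above say, `{bucket}`, `{region}` and `{key}` are inserted at runtime, which is exactly what `get_cdn_url` does with `str.format`. For example, with illustrative values:

# Illustrative values only, applied to the default cdn_url template:
"https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}".format(
    bucket="my-archive", region="fra1", key="2025/01/example.png"
)
# -> "https://my-archive.fra1.cdn.digitaloceanspaces.com/2025/01/example.png"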

View file

@@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        super().__init__(config)
        # TODO?

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "width": {"default": 1280, "help": "width of the screenshots"},
    #         "height": {"default": 720, "help": "height of the screenshots"},
    #         "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
    #         "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
    #         "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
    #         "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
    #         "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
    #     }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

View file

@@ -15,13 +15,28 @@
    "configs": {
        "tsa_urls": {
            "default": [
                "http://timestamp.digicert.com",
                "http://timestamp.identrust.com",
                "http://timestamp.globalsign.com/tsa/r6advanced1",
                "http://tss.accv.es:8318/tsa"
            ],
                # [Adobe Approved Trust List] and [Windows Cert Store]
                "http://timestamp.digicert.com",
                "http://timestamp.identrust.com",
                # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
                # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
                # [Adobe: European Union Trusted Lists].
                # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
                # [Windows Cert Store]
                "http://timestamp.globalsign.com/tsa/r6advanced1",
                # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
                # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
                # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
                # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
                # "http://tsa.sep.bg", # self-signed certificate in certificate chain
                # "http://tsa.izenpe.com", #unable to get local issuer certificate
                # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
                "http://tss.accv.es:8318/tsa",
            ],
            "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
            "type": auto_archiver.utils.parse_csv_to_set,
            "type": "auto_archiver.utils.parse_csv_to_set",
        }
    },
    "description": """

View file

@@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        super().__init__(config)

    # @staticmethod
    # def configs() -> dict:
    #     return {
    #         "tsa_urls": {
    #             "default": [
    #                 # [Adobe Approved Trust List] and [Windows Cert Store]
    #                 "http://timestamp.digicert.com",
    #                 "http://timestamp.identrust.com",
    #                 # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
    #                 # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
    #
    #                 # [Adobe: European Union Trusted Lists].
    #                 # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
    #
    #                 # [Windows Cert Store]
    #                 "http://timestamp.globalsign.com/tsa/r6advanced1",
    #
    #                 # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
    #                 # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
    #                 # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
    #                 # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
    #                 # "http://tsa.sep.bg", # self-signed certificate in certificate chain
    #                 # "http://tsa.izenpe.com", #unable to get local issuer certificate
    #                 # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
    #                 "http://tss.accv.es:8318/tsa",
    #             ],
    #             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
    #             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
    #         }
    #     }

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        logger.debug(f"RFC3161 timestamping existing files for {url=}")

View file

@@ -12,7 +12,7 @@
    "configs": {
        "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
        "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
            "type": auto_archiver.utils.parse_csv_to_set,},
            "type": "auto_archiver.utils.parse_csv_to_set",},
        "consumer_key": {"default": None, "help": "twitter API consumer_key"},
        "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
        "access_token": {"default": None, "help": "twitter API access_token"},

View file

@@ -16,35 +16,6 @@ class Gsheets(Step):
        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
        assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."

    @staticmethod
    def configs() -> dict:
        return {
            "sheet": {"default": None, "help": "name of the sheet to archive"},
            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
            "columns": {
                "default": {
                    'url': 'link',
                    'status': 'archive status',
                    'folder': 'destination folder',
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'pdq_hash': 'perceptual hashes',
                    'wacz': 'wacz',
                    'replaywebpage': 'replaywebpage',
                },
                "help": "names of columns in the google sheet (stringified JSON object)",
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            },
        }

    def open_sheet(self):
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
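Note the removed `columns` entry relied on a `cli_set` lambda, which cannot live in a declarative manifest; that is presumably part of why these blocks had to go. What that lambda did, written out as a named function (`merge_columns` is a hypothetical name for illustration):

import json

def merge_columns(cli_val: str, cur_val: dict) -> dict:
    # overlay the user-supplied JSON object from the CLI onto the default mapping
    return dict(cur_val, **json.loads(cli_val))

# e.g. merge_columns('{"url": "source link"}', {"url": "link", "status": "archive status"})
# -> {"url": "source link", "status": "archive status"}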