diff --git a/src/auto_archiver/base_modules/__init__.py b/src/auto_archiver/base_processors/__init__.py similarity index 100% rename from src/auto_archiver/base_modules/__init__.py rename to src/auto_archiver/base_processors/__init__.py diff --git a/src/auto_archiver/base_modules/database.py b/src/auto_archiver/base_processors/database.py similarity index 100% rename from src/auto_archiver/base_modules/database.py rename to src/auto_archiver/base_processors/database.py diff --git a/src/auto_archiver/base_modules/enricher.py b/src/auto_archiver/base_processors/enricher.py similarity index 100% rename from src/auto_archiver/base_modules/enricher.py rename to src/auto_archiver/base_processors/enricher.py diff --git a/src/auto_archiver/base_modules/extractor.py b/src/auto_archiver/base_processors/extractor.py similarity index 100% rename from src/auto_archiver/base_modules/extractor.py rename to src/auto_archiver/base_processors/extractor.py diff --git a/src/auto_archiver/base_modules/feeder.py b/src/auto_archiver/base_processors/feeder.py similarity index 100% rename from src/auto_archiver/base_modules/feeder.py rename to src/auto_archiver/base_processors/feeder.py diff --git a/src/auto_archiver/base_modules/formatter.py b/src/auto_archiver/base_processors/formatter.py similarity index 100% rename from src/auto_archiver/base_modules/formatter.py rename to src/auto_archiver/base_processors/formatter.py diff --git a/src/auto_archiver/base_modules/storage.py b/src/auto_archiver/base_processors/storage.py similarity index 100% rename from src/auto_archiver/base_modules/storage.py rename to src/auto_archiver/base_processors/storage.py diff --git a/src/auto_archiver/modules/api_db/__init__.py b/src/auto_archiver/modules/api_db/__init__.py index e69de29..2070b06 100644 --- a/src/auto_archiver/modules/api_db/__init__.py +++ b/src/auto_archiver/modules/api_db/__init__.py @@ -0,0 +1 @@ +from .api_db import AAApiDb \ No newline at end of file diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index a55f26c..c422b49 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -15,7 +15,9 @@ "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, + "tags": {"default": [], "help": "what tags to add to the archived URL", + "type": lambda val: set(val.split(",")), + } }, "description": """ Provides integration with the Auto-Archiver API for querying and storing archival data.
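Side note on the configuration-schema change in the hunk above (and repeated across the manifests below): the old `cli_set` callbacks received two arguments, the raw CLI string and the current value, whereas the new `type` callables take a single raw value, presumably coerced much like argparse's `type=`. A minimal comparison using the `tags` option, written as a standalone illustration rather than code from the repository:

# old style: callback combines the CLI string with the current value
old_cli_set = lambda cli_val, cur_val: set(cli_val.split(","))
print(old_cli_set("war,kyiv", []))   # {'war', 'kyiv'} (set order varies)

# new style: a one-argument coercion applied to the raw CLI value
new_type = lambda val: set(val.split(","))
print(new_type("war,kyiv"))          # {'war', 'kyiv'}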
diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index 44373c6..d2b43b7 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/atlos/__init__.py b/src/auto_archiver/modules/atlos/__init__.py index e69de29..de7fead 100644 --- a/src/auto_archiver/modules/atlos/__init__.py +++ b/src/auto_archiver/modules/atlos/__init__.py @@ -0,0 +1 @@ +from .atlos import AtlosStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py index c600e43..ec356a5 100644 --- a/src/auto_archiver/modules/atlos/__manifest__.py +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -15,12 +15,12 @@ "api_token": { "default": None, "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val, + "type": str, }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val, + "type": str, }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 28b7cb1..0b16714 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -5,7 +5,7 @@ import requests import hashlib from auto_archiver.core import Media, Metadata -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py index e69de29..1552e39 100644 --- a/src/auto_archiver/modules/atlos_db/__init__.py +++ b/src/auto_archiver/modules/atlos_db/__init__.py @@ -0,0 +1 @@ +from .atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 470d07d..941206f 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -11,12 +11,11 @@ "api_token": { "default": None, "help": "An Atlos API token.
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index cbf1c89..c1d20a1 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -6,7 +6,7 @@ from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/base_configs.py b/src/auto_archiver/modules/atlos_db/base_configs.py index c47c711..f672f82 100644 --- a/src/auto_archiver/modules/atlos_db/base_configs.py +++ b/src/auto_archiver/modules/atlos_db/base_configs.py @@ -3,11 +3,11 @@ def get_atlos_config_options(): "api_token": { "default": None, "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val + "type": str }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, } \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/__init__.py b/src/auto_archiver/modules/atlos_feeder/__init__.py index e69de29..67b243a 100644 --- a/src/auto_archiver/modules/atlos_feeder/__init__.py +++ b/src/auto_archiver/modules/atlos_feeder/__init__.py @@ -0,0 +1 @@ +from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py index f0b216b..91fed32 100644 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -9,12 +9,12 @@ "api_token": { "default": None, "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "cli_set": lambda cli_val, _: cli_val + "type": str }, "atlos_url": { "default": "https://platform.atlos.org", "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "cli_set": lambda cli_val, _: cli_val + "type": str }, }, "description": """ diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 0810b73..8a4a31a 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import requests -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/cli_feeder/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py index e69de29..9c85787 100644 --- a/src/auto_archiver/modules/cli_feeder/__init__.py +++ b/src/auto_archiver/modules/cli_feeder/__init__.py @@ -0,0 +1 @@ +from .cli_feeder import CLIFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index fcb9099..2e2c53e 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -9,7 +9,7 @@ "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index e826533..3380f90 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/__init__.py b/src/auto_archiver/modules/console_db/__init__.py index e69de29..343f09c 100644 --- a/src/auto_archiver/modules/console_db/__init__.py +++ b/src/auto_archiver/modules/console_db/__init__.py @@ -0,0 +1 @@ +from .console_db import ConsoleDb \ No newline at end of file diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index a0d43b7..9dfeb2c 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/__init__.py b/src/auto_archiver/modules/csv_db/__init__.py index e69de29..1092cb2 100644 --- a/src/auto_archiver/modules/csv_db/__init__.py +++ b/src/auto_archiver/modules/csv_db/__init__.py @@ -0,0 +1 @@ +from .csv_db import CSVDb \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 6e5d873..eec4ec6 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ 
b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/__init__.py b/src/auto_archiver/modules/csv_feeder/__init__.py index e69de29..161b78d 100644 --- a/src/auto_archiver/modules/csv_feeder/__init__.py +++ b/src/auto_archiver/modules/csv_feeder/__init__.py @@ -0,0 +1 @@ +from .csv_feeder import CSVFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index ad5d40b..fb644ec 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -11,7 +11,7 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. \ Input files should be formatted with one URL per line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, "column": { "default": None, diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 4cf2f11..a830791 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import csv -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none @@ -17,7 +17,7 @@ class CSVFeeder(Feeder): "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. 
\ Input files should be formatted with one URL per line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + "type": lambda val: set(val.split(",")), }, "column": { "default": None, diff --git a/src/auto_archiver/modules/gdrive_storage/__init__.py b/src/auto_archiver/modules/gdrive_storage/__init__.py index e69de29..2765e4b 100644 --- a/src/auto_archiver/modules/gdrive_storage/__init__.py +++ b/src/auto_archiver/modules/gdrive_storage/__init__.py @@ -0,0 +1 @@ +from .gdrive_storage import GDriveStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py index e7e4650..b81b717 100644 --- a/src/auto_archiver/modules/gdrive_storage/__manifest__.py +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "Google Drive Storage", "type": ["storage"], "requires_setup": True, @@ -12,15 +12,16 @@ m = { ], }, "configs": { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - }, - # TODO: get base storage configs + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], + }, "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. 
NOTE: storage used will count towards the developer account."}, diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 2e4ca48..652ff91 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -10,7 +10,7 @@ from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage class GDriveStorage(Storage): diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index d4051aa..c75c373 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 9de63d2..99cd71b 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 8e4b2c4..ff9f8b4 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,7 +5,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from ...core import Metadata, Media, ArchivingContext class GenericExtractor(Extractor): diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index e713c90..f52a748 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 6cd22b1..11399d4 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import 
Extractor from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py new file mode 100644 index 0000000..01fdee6 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_db/__init__.py @@ -0,0 +1 @@ +from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py new file mode 100644 index 0000000..df7fb6a --- /dev/null +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -0,0 +1,38 @@ +{ + "name": "Google Sheets Database", + "type": ["database"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "gspread", "python-slugify"], + }, + "configs": { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "type": lambda val: set(val.split(",")), + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + "type": lambda val: set(val.split(",")), + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } + }, + "description": """ + GsheetsDatabase: + Handles integration with Google Sheets for tracking archival tasks. + +### Features +- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. +- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. +- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. +- Skips redundant updates for empty or invalid data fields. + +### Notes +- Currently works only with metadata provided by GsheetFeeder. +- Requires configuration of a linked Google Sheet and appropriate API credentials. 
+ """ +} diff --git a/src/auto_archiver/modules/gsheet_processor/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py similarity index 98% rename from src/auto_archiver/modules/gsheet_processor/gsheet_db.py rename to src/auto_archiver/modules/gsheet_db/gsheet_db.py index cf46473..9ed3642 100644 --- a/src/auto_archiver/modules/gsheet_processor/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -5,7 +5,7 @@ from urllib.parse import quote from loguru import logger -from auto_archiver.base_modules import Database +from auto_archiver.base_processors import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import GWorksheet @@ -105,5 +105,4 @@ class GsheetsDb(Database): elif self.sheet_id: print(self.sheet_id) - return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py new file mode 100644 index 0000000..f122bb2 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py @@ -0,0 +1 @@ +from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_processor/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py similarity index 63% rename from src/auto_archiver/modules/gsheet_processor/__manifest__.py rename to src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 8a554fe..c6790ca 100644 --- a/src/auto_archiver/modules/gsheet_processor/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -9,12 +9,12 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + "type": lambda val: set(val.split(",")), }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + "type": lambda val: set(val.split(",")), }, "use_sheet_names_in_stored_paths": { "default": True, @@ -22,11 +22,7 @@ } }, "description": """ - Google Sheets Module. - - Handles feeding from a google sheet as well as an optional write back to the sheet. - - ## GsheetsFeeder + GsheetsFeeder A Google Sheets-based feeder for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. @@ -41,18 +37,5 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. - - ## GsheetsDatabase: - Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. -- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. 
""" } diff --git a/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py similarity index 98% rename from src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 4df9042..a417615 100644 --- a/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -13,7 +13,7 @@ import gspread, os from loguru import logger from slugify import slugify -from auto_archiver.base_modules import Feeder +from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import Gsheets, GWorksheet diff --git a/src/auto_archiver/modules/gsheet_processor/__init__.py b/src/auto_archiver/modules/gsheet_processor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py b/src/auto_archiver/modules/hash_enricher/__init__.py index e7faff7..18ec885 100644 --- a/src/auto_archiver/modules/hash_enricher/__init__.py +++ b/src/auto_archiver/modules/hash_enricher/__init__.py @@ -1 +1 @@ -from hash_enricher import HashEnricher \ No newline at end of file +from .hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index c8eacb1..8731b06 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. import hashlib from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/html_formatter/__init__.py b/src/auto_archiver/modules/html_formatter/__init__.py index e69de29..432ef33 100644 --- a/src/auto_archiver/modules/html_formatter/__init__.py +++ b/src/auto_archiver/modules/html_formatter/__init__.py @@ -0,0 +1 @@ +from .html_formatter import HtmlFormatter \ No newline at end of file diff --git a/src/auto_archiver/modules/html_formatter/__manifest__.py b/src/auto_archiver/modules/html_formatter/__manifest__.py index 55ca5da..259a3d1 100644 --- a/src/auto_archiver/modules/html_formatter/__manifest__.py +++ b/src/auto_archiver/modules/html_formatter/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "HTML Formatter", "type": ["formatter"], "requires_setup": False, diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index cc8a4da..a1951f3 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -9,7 +9,7 @@ import base64 from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.base_modules import Formatter +from auto_archiver.base_processors import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/instagram_api_extractor/__init__.py b/src/auto_archiver/modules/instagram_api_extractor/__init__.py index e69de29..068b8c6 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/__init__.py +++ 
b/src/auto_archiver/modules/instagram_api_extractor/__init__.py @@ -0,0 +1 @@ +from .instagram_api_archiver import InstagramAPIExtractor diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py index 5206b41..c1271fc 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py @@ -16,7 +16,7 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/instagram_extractor/__init__.py b/src/auto_archiver/modules/instagram_extractor/__init__.py index e69de29..37ec56c 100644 --- a/src/auto_archiver/modules/instagram_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_extractor/__init__.py @@ -0,0 +1 @@ +from .instagram_archiver import InstagramExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py index c6bde62..2b9bece 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py @@ -7,7 +7,7 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py index e69de29..1b4dbc3 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py @@ -0,0 +1 @@ +from .instagram_tbot_archiver import InstagramTbotExtractor diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py index 5c3ad24..36c8a06 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py @@ -15,7 +15,7 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str diff --git a/src/auto_archiver/modules/local_storage/__init__.py b/src/auto_archiver/modules/local_storage/__init__.py index e69de29..6746373 100644 --- a/src/auto_archiver/modules/local_storage/__init__.py +++ b/src/auto_archiver/modules/local_storage/__init__.py @@ -0,0 +1 @@ +from .local import LocalStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index 7247885..c012be0 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "Local 
Storage", "type": ["storage"], "requires_setup": False, @@ -9,10 +9,12 @@ m = { "path_generator": { "default": "url", "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], }, "filename_generator": { "default": "random", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], }, "save_to": {"default": "./archived", "help": "folder where to save archived content"}, "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local.py index ef0966d..cac692e 100644 --- a/src/auto_archiver/modules/local_storage/local.py +++ b/src/auto_archiver/modules/local_storage/local.py @@ -5,7 +5,7 @@ import os from loguru import logger from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage class LocalStorage(Storage): diff --git a/src/auto_archiver/modules/meta_enricher/__init__.py b/src/auto_archiver/modules/meta_enricher/__init__.py index e69de29..4e1d330 100644 --- a/src/auto_archiver/modules/meta_enricher/__init__.py +++ b/src/auto_archiver/modules/meta_enricher/__init__.py @@ -0,0 +1 @@ +from .meta_enricher import MetaEnricher diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index 52d8eb2..f9b74f7 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/__init__.py b/src/auto_archiver/modules/metadata_enricher/__init__.py index e69de29..020bd4a 100644 --- a/src/auto_archiver/modules/metadata_enricher/__init__.py +++ b/src/auto_archiver/modules/metadata_enricher/__init__.py @@ -0,0 +1 @@ +from .metadata_enricher import MetadataEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index b729d36..cb68b98 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/mute_formatter/__init__.py b/src/auto_archiver/modules/mute_formatter/__init__.py index e69de29..b92fce9 100644 --- a/src/auto_archiver/modules/mute_formatter/__init__.py +++ b/src/auto_archiver/modules/mute_formatter/__init__.py @@ -0,0 +1 @@ +from .mute_formatter import MuteFormatter diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py index e69de29..b444197 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py +++ 
b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py @@ -0,0 +1 @@ +from .pdq_hash_enricher import PdqHashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index dc70465..7e3f467 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/__init__.py b/src/auto_archiver/modules/s3_storage/__init__.py index e69de29..1c826fd 100644 --- a/src/auto_archiver/modules/s3_storage/__init__.py +++ b/src/auto_archiver/modules/s3_storage/__init__.py @@ -0,0 +1 @@ +from .s3 import S3Storage \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/__manifest__.py b/src/auto_archiver/modules/s3_storage/__manifest__.py index 210eefa..fc41eb3 100644 --- a/src/auto_archiver/modules/s3_storage/__manifest__.py +++ b/src/auto_archiver/modules/s3_storage/__manifest__.py @@ -1,4 +1,4 @@ -m = { +{ "name": "S3 Storage", "type": ["storage"], "requires_setup": True, @@ -6,29 +6,31 @@ m = { "python": ["boto3", "loguru"], }, "configs": { - "path_generator": { - "default": "url", - "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", - }, - "filename_generator": { - "default": "random", - "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", - }, - "bucket": {"default": None, "help": "S3 bucket name"}, - "region": {"default": None, "help": "S3 region name"}, - "key": {"default": None, "help": "S3 API key"}, - "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. 
Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, - "endpoint_url": { - "default": 'https://{region}.digitaloceanspaces.com', - "help": "S3 bucket endpoint, {region} are inserted at runtime" - }, - "cdn_url": { - "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', - "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" - }, - "private": {"default": False, "help": "if true S3 files will not be readable online"}, - }, + "path_generator": { + "default": "url", + "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", + "choices": ["flat", "url", "random"], + }, + "filename_generator": { + "default": "random", + "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", + "choices": ["random", "static"], + }, + "bucket": {"default": None, "help": "S3 bucket name"}, + "region": {"default": None, "help": "S3 region name"}, + "key": {"default": None, "help": "S3 API key"}, + "secret": {"default": None, "help": "S3 API secret"}, + "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, + "endpoint_url": { + "default": 'https://{region}.digitaloceanspaces.com', + "help": "S3 bucket endpoint, {region} are inserted at runtime" + }, + "cdn_url": { + "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', + "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" + }, + "private": {"default": False, "help": "if true S3 files will not be readable online"}, + }, "description": """ S3Storage: A storage module for saving media files to an S3-compatible object storage. 
diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index 02b0613..fe221d0 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -4,7 +4,7 @@ import boto3, os from auto_archiver.utils.misc import random_str from auto_archiver.core import Media -from auto_archiver.base_modules import Storage +from auto_archiver.base_processors import Storage # TODO from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger diff --git a/src/auto_archiver/modules/screenshot_enricher/__init__.py b/src/auto_archiver/modules/screenshot_enricher/__init__.py index e69de29..393f726 100644 --- a/src/auto_archiver/modules/screenshot_enricher/__init__.py +++ b/src/auto_archiver/modules/screenshot_enricher/__init__.py @@ -0,0 +1 @@ +from .screenshot_enricher import ScreenshotEnricher diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index f99c100..626cd1f 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,7 +5,7 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/ssl_enricher/__init__.py b/src/auto_archiver/modules/ssl_enricher/__init__.py index e69de29..23d2bee 100644 --- a/src/auto_archiver/modules/ssl_enricher/__init__.py +++ b/src/auto_archiver/modules/ssl_enricher/__init__.py @@ -0,0 +1 @@ +from .ssl_enricher import SSLEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index aba1d33..965f699 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git a/src/auto_archiver/modules/telegram_extractor/__init__.py b/src/auto_archiver/modules/telegram_extractor/__init__.py index e69de29..1fd80c2 100644 --- a/src/auto_archiver/modules/telegram_extractor/__init__.py +++ b/src/auto_archiver/modules/telegram_extractor/__init__.py @@ -0,0 +1 @@ +from .telegram_extractor import TelegramExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index 047d424..31bdaca 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py index e69de29..424792f 100644 --- 
a/src/auto_archiver/modules/telethon_extractor/__init__.py +++ b/src/auto_archiver/modules/telethon_extractor/__init__.py @@ -0,0 +1 @@ +from .telethon_archiver import TelethonArchiver \ No newline at end of file diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py index 6f09ea6..bb49882 100644 --- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -1,4 +1,4 @@ -# TODO rm dependency on json +import json { "name": "telethon_extractor", "type": ["extractor"], @@ -19,8 +19,7 @@ "channel_invites": { "default": {}, "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", - # TODO - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + "type": lambda x: json.loads(x), } }, "description": """ diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py index 811a280..8b49a10 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py @@ -8,7 +8,7 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str diff --git a/src/auto_archiver/modules/thumbnail_enricher/__init__.py b/src/auto_archiver/modules/thumbnail_enricher/__init__.py index e69de29..fe20719 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/__init__.py +++ b/src/auto_archiver/modules/thumbnail_enricher/__init__.py @@ -0,0 +1 @@ +from .thumbnail_enricher import ThumbnailEnricher diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index a16d84a..8c34502 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. 
import ffmpeg, os from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/timestamping_enricher/__init__.py b/src/auto_archiver/modules/timestamping_enricher/__init__.py index e69de29..62d358a 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__init__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__init__.py @@ -0,0 +1 @@ +from .timestamping_enricher import TimestampingEnricher diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index a66cc31..b49b61b 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -21,7 +21,7 @@ "http://tss.accv.es:8318/tsa" ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + "type": lambda val: set(val.split(",")), } }, "description": """ diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 473f880..0e159fa 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor class TimestampingEnricher(Enricher): diff --git a/src/auto_archiver/modules/twitter_api_extractor/__init__.py b/src/auto_archiver/modules/twitter_api_extractor/__init__.py index e69de29..cea3872 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__init__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__init__.py @@ -0,0 +1 @@ +from .twitter_api_archiver import TwitterApiExtractor \ No newline at end of file diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index ae1b0ff..0a314b5 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. 
CSV of bearer tokens if provided via the command line", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "type": lambda val: set(val.split(",")),}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py index c5d03e0..ea669b4 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py @@ -8,7 +8,7 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): diff --git a/src/auto_archiver/modules/vk_extractor/__init__.py b/src/auto_archiver/modules/vk_extractor/__init__.py index e69de29..29fe59d 100644 --- a/src/auto_archiver/modules/vk_extractor/__init__.py +++ b/src/auto_archiver/modules/vk_extractor/__init__.py @@ -0,0 +1 @@ +from .vk_archiver import VkExtractor diff --git a/src/auto_archiver/modules/vk_extractor/vk_archiver.py b/src/auto_archiver/modules/vk_extractor/vk_archiver.py index 2474769..eb4c171 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_archiver.py +++ b/src/auto_archiver/modules/vk_extractor/vk_archiver.py @@ -2,7 +2,7 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.base_modules import Extractor +from auto_archiver.base_processors import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext diff --git a/src/auto_archiver/modules/wacz_enricher/__init__.py b/src/auto_archiver/modules/wacz_enricher/__init__.py index e69de29..686b8d8 100644 --- a/src/auto_archiver/modules/wacz_enricher/__init__.py +++ b/src/auto_archiver/modules/wacz_enricher/__init__.py @@ -0,0 +1 @@ +from .wacz_enricher import WaczExtractorEnricher diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 3eb2b17..cd52b67 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,7 +6,7 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.base_modules import Extractor, Enricher +from auto_archiver.base_processors import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str diff --git a/src/auto_archiver/modules/wayback_enricher/__init__.py b/src/auto_archiver/modules/wayback_enricher/__init__.py index e69de29..9782831 100644 --- a/src/auto_archiver/modules/wayback_enricher/__init__.py +++ b/src/auto_archiver/modules/wayback_enricher/__init__.py @@ -0,0 +1 @@ +from .wayback_enricher import WaybackExtractorEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index bcd2450..6942727 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,7 +2,7 @@ import 
json from loguru import logger import time, requests -from auto_archiver.base_modules import Extractor, Enricher +from auto_archiver.base_processors import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/whisper_enricher/__init__.py b/src/auto_archiver/modules/whisper_enricher/__init__.py index e69de29..d3d3526 100644 --- a/src/auto_archiver/modules/whisper_enricher/__init__.py +++ b/src/auto_archiver/modules/whisper_enricher/__init__.py @@ -0,0 +1 @@ +from .whisper_enricher import WhisperEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a00ba25..d14c537 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from auto_archiver.base_modules import Enricher +from auto_archiver.base_processors import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.modules import S3Storage +from auto_archiver.modules.s3_storage import S3Storage class WhisperEnricher(Enricher): diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py index 721812a..6223879 100644 --- a/tests/archivers/test_archiver_base.py +++ b/tests/archivers/test_archiver_base.py @@ -1,7 +1,7 @@ import pytest from auto_archiver.core.metadata import Metadata -from auto_archiver.base_modules.extractor import Extractor +from auto_archiver.base_processors.extractor import Extractor class TestArchiverBase(object): archiver_class: str = None
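Postscript on the manifest changes above: the `cli_set` callbacks are replaced throughout by single-argument `type` callables plus optional `choices` lists. The sketch below shows one way such a "configs" schema could be wired into argparse, which applies `type` to the raw CLI string and validates against `choices`. It is not code from the auto-archiver repository; the helper name, option-naming scheme and `example_configs` contents are assumptions for illustration only.

# Illustrative sketch -- hypothetical loader, not the project's actual one.
import argparse

example_configs = {
    "tags": {
        "default": [],
        "help": "what tags to add to the archived URL",
        "type": lambda val: set(val.split(",")),   # CSV string -> set
    },
    "path_generator": {
        "default": "url",
        "help": "how to store the file in terms of directory structure",
        "choices": ["flat", "url", "random"],
    },
}

def add_manifest_options(parser, module_name, configs):
    # Register one CLI option per manifest config entry (hypothetical helper).
    for key, spec in configs.items():
        parser.add_argument(
            f"--{module_name}.{key}",
            dest=f"{module_name}_{key}",
            default=spec.get("default"),
            help=spec.get("help", ""),
            # argparse applies "type" to the raw CLI string, matching the
            # single-argument lambdas introduced in the manifests.
            type=spec.get("type", str),
            choices=spec.get("choices"),
        )

parser = argparse.ArgumentParser()
add_manifest_options(parser, "api_db", example_configs)
args = parser.parse_args(["--api_db.tags", "war,kyiv", "--api_db.path_generator", "url"])
print(args.api_db_tags)            # e.g. {'war', 'kyiv'} (set order varies)
print(args.api_db_path_generator)  # url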