kopia lustrzana https://github.com/bellingcat/auto-archiver
config refactor and cleanup
rodzic
c679e02c73
commit
a2fdfacb26
|
@ -3,13 +3,19 @@ import argparse, json
|
|||
import gspread
|
||||
from loguru import logger
|
||||
from selenium import webdriver
|
||||
from storages.local_storage import LocalStorage
|
||||
from dataclasses import dataclass
|
||||
|
||||
from utils.gworksheet import GWorksheet
|
||||
from storages import S3Config, S3Storage
|
||||
from .wayback_config import WaybackConfig
|
||||
from .telegram_config import TelegramConfig
|
||||
from storages import Storage
|
||||
from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage
|
||||
|
||||
|
||||
@dataclass
class SeleniumConfig:
    """
    Settings applied to the Selenium webdriver.

    Every field has a default, so ``SeleniumConfig()`` is a valid instance;
    the class-level defaults are also read directly (``SeleniumConfig.window_width``)
    as fallbacks when a key is absent from the configuration file.
    """
    # handed to webdriver.set_page_load_timeout()
    timeout_seconds: int = 120
    # first argument of webdriver.set_window_size()
    window_width: int = 1400
    # second argument of webdriver.set_window_size()
    window_height: int = 2000
|
||||
|
||||
|
||||
class Config:
|
||||
|
@ -20,8 +26,13 @@ class Config:
|
|||
c.parse() # parses the values and initializes the Services and API clients
|
||||
# you can then access the Services and APIs like
|
||||
c.s3_config
|
||||
|
||||
All the configurations available as cmd line options, when included, will
|
||||
override the configurations in the config.json file.
|
||||
Configurations are split between:
|
||||
1. "secrets" containing API keys for generating services - not kept in memory
|
||||
2. "execution" containing specific execution configurations
|
||||
"""
|
||||
AVAILABLE_STORAGES = {"s3", "gd", "local"}
|
||||
|
||||
def __init__(self):
|
||||
self.parser = self.get_argument_parser()
|
||||
|
@ -38,37 +49,40 @@ class Config:
|
|||
with open(self.config_file, "r", encoding="utf-8") as inf:
|
||||
self.config = json.load(inf)
|
||||
|
||||
# ----------------------EXECUTION - execution configurations
|
||||
execution = self.config.get("execution", {})
|
||||
|
||||
# general sheet configurations
|
||||
self.sheet = getattr(self.args, "sheet") or execution.get("sheet")
|
||||
self.sheet = getattr(self.args, "sheet", execution.get("sheet"))
|
||||
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
|
||||
self.header = int(getattr(self.args, "header", execution.get("header", 1)))
|
||||
Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER)
|
||||
self.storage = getattr(self.args, "storage", execution.get("storage", "s3"))
|
||||
|
||||
self.header = int(getattr(self.args, "header") or execution.get("header", 1))
|
||||
self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER)
|
||||
Storage.TMP_FOLDER = self.tmp_folder
|
||||
|
||||
self.storage = execution.get("storage", "s3")
|
||||
for key, name in [("s3", "s3"), ("gd", "google_drive")]:
|
||||
assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
|
||||
|
||||
# Column names come from config and can be overwritten by CMD
|
||||
# in the end all are considered as lower case
|
||||
config_column_names = execution.get("column_names", {})
|
||||
self.column_names = {}
|
||||
for k in GWorksheet.COLUMN_NAMES.keys():
|
||||
self.column_names[k] = getattr(self.args, k) or config_column_names.get(k) or GWorksheet.COLUMN_NAMES[k]
|
||||
self.column_names = {k: v.lower() for k, v in self.column_names.items()}
|
||||
self.column_names[k] = getattr(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
|
||||
|
||||
# selenium driver
|
||||
selenium_configs = execution.get("selenium", {})
|
||||
self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10))
|
||||
self.selenium_config = SeleniumConfig(
|
||||
timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
|
||||
window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
|
||||
window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
|
||||
)
|
||||
self.webdriver = "not initalized"
|
||||
|
||||
# APIs and service configurations
|
||||
# ---------------------- SECRETS - APIs and service configurations
|
||||
secrets = self.config.get("secrets", {})
|
||||
|
||||
# google sheets config
|
||||
self.gsheets_client = gspread.service_account(
|
||||
filename=secrets.get("google_api", {}).get("filename", 'service_account.json')
|
||||
filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
|
||||
)
|
||||
|
||||
# facebook config
|
||||
|
@ -81,15 +95,22 @@ class Config:
|
|||
bucket=s3["bucket"],
|
||||
region=s3["region"],
|
||||
key=s3["key"],
|
||||
secret=s3["secret"]
|
||||
secret=s3["secret"],
|
||||
endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
|
||||
cdn_url=s3.get("cdn_url", S3Config.cdn_url),
|
||||
key_path=s3.get("key_path", S3Config.key_path),
|
||||
private=getattr(self.args, "s3-private", s3.get("private", S3Config.private)),
|
||||
no_folder=s3.get("no_folder", S3Config.no_folder),
|
||||
)
|
||||
|
||||
# GDrive config
|
||||
if "google_drive" in secrets:
|
||||
gd = secrets["google_drive"]
|
||||
self.gd_config = GDConfig(
|
||||
root_folder_id=gd.get("root_folder_id"),
|
||||
default_folder=gd.get("default_folder", GDConfig.default_folder),
|
||||
service_account=gd.get("service_account", GDConfig.service_account),
|
||||
)
|
||||
self.s3_config.private = getattr(self.args, "private") or s3["private"] or self.s3_config.private
|
||||
self.s3_config.endpoint_url = s3["endpoint_url"] or self.s3_config.endpoint_url
|
||||
self.s3_config.cdn_url = s3["cdn_url"] or self.s3_config.cdn_url
|
||||
self.s3_config.key_path = s3["key_path"] or self.s3_config.key_path
|
||||
self.s3_config.no_folder = s3["no_folder"] or self.s3_config.no_folder
|
||||
else:
|
||||
logger.debug(f"'s3' key not present in the {self.config_file=}")
|
||||
|
||||
# wayback machine config
|
||||
if "wayback" in secrets:
|
||||
|
@ -109,32 +130,44 @@ class Config:
|
|||
else:
|
||||
logger.debug(f"'telegram' key not present in the {self.config_file=}")
|
||||
|
||||
del self.config["secrets"]
|
||||
del self.config["secrets"] # delete to prevent leaks
|
||||
|
||||
def get_argument_parser(self):
    """
    Creates the CMD line arguments. 'python auto_archive.py --help'

    Only '--config' carries a default; every other 'store' option is left
    as None when it is not given on the command line, and the 'store_true'
    flags are False when absent.

    Returns:
        argparse.ArgumentParser: parser exposing --config, --storage, --sheet,
        --header, --private, --s3-private and one --col-<key> option for each
        key of GWorksheet.COLUMN_NAMES (stored under dest=<key>).
    """
    # the parser is built exactly once; a second construction would silently discard the first
    parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. ')

    parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json')
    parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES)
    parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]')
    parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]')
    # NOTE(review): '--private' looks superseded by '--s3-private'; kept so existing
    # invocations keep parsing - confirm before removing. Its help used to point at
    # execution.header, which is the header-row setting, not the privacy one.
    parser.add_argument('--private', action='store_true', help='Store content without public access permission [secrets.s3.private in config.json]')
    # argparse derives the destination 's3_private' (dash -> underscore) for this flag
    parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]')

    # each column option is registered once: adding the same option string
    # a second time makes argparse raise a conflicting-option error
    for k, v in GWorksheet.COLUMN_NAMES.items():
        parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to fill with {k} (default='{v}')")

    return parser
|
||||
|
||||
def set_folder(self, folder):
    """
    Records `folder` on this config and pushes it to every storage
    configuration that is present.

    Args:
        folder: value stored as self.folder, as s3_config.folder and as
            gd_config.default_folder.
    """
    # update the folder in each of the storages
    self.folder = folder

    # storage configs are optional: getattr covers both "set to None" and
    # "never assigned", so a missing storage is skipped instead of raising
    s3_config = getattr(self, "s3_config", None)
    if s3_config:
        s3_config.folder = folder

    gd_config = getattr(self, "gd_config", None)
    if gd_config:
        gd_config.default_folder = folder
|
||||
|
||||
def get_storage(self):
    """
    creates and returns the configured type of storage

    Returns:
        S3Storage(self.s3_config) when self.storage is "s3",
        GDStorage(self.gd_config) when it is "gd",
        LocalStorage(self.folder) when it is "local".

    Raises:
        ValueError: when self.storage is none of the values above.
    """
    if self.storage == "s3":
        return S3Storage(self.s3_config)
    if self.storage == "gd":
        return GDStorage(self.gd_config)
    if self.storage == "local":
        return LocalStorage(self.folder)
    # a bare string cannot be raised in Python 3 (that is itself a TypeError),
    # so the message is wrapped in a real exception
    raise ValueError(f"storage {self.storage} not implemented, available: {self.AVAILABLE_STORAGES}")
|
||||
|
||||
def destroy_webdriver(self):
|
||||
if self.webdriver is not None and type(self.webdriver) != str:
|
||||
|
@ -146,16 +179,17 @@ class Config:
|
|||
options.headless = True
|
||||
options.set_preference('network.protocol-handler.external.tg', False)
|
||||
self.webdriver = webdriver.Firefox(options=options)
|
||||
self.webdriver.set_window_size(1400, 2000)
|
||||
self.webdriver.set_page_load_timeout(self.selenium_timeout)
|
||||
self.webdriver.set_window_size(self.selenium_config.window_width,
|
||||
self.selenium_config.window_height)
|
||||
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return json.dumps({
|
||||
"config_file": self.config_file,
|
||||
"sheet": self.sheet,
|
||||
"header": self.header,
|
||||
"tmp_folder": self.tmp_folder,
|
||||
"selenium_timeout_seconds": self.selenium_timeout,
|
||||
"tmp_folder": Storage.TMP_FOLDER,
|
||||
"selenium_config": self.selenium_config,
|
||||
"selenium_webdriver": self.webdriver != None,
|
||||
"s3_config": self.s3_config != None,
|
||||
"s3_private": getattr(self.s3_config, "private", None),
|
||||
|
|
|
@ -12,14 +12,16 @@ import time
|
|||
@dataclass
class GDConfig:
    """
    Settings consumed by GDStorage.

    Only `root_folder_id` is required; the remaining fields have defaults.
    """
    # copied onto the storage as GDStorage.root_folder_id
    root_folder_id: str
    # NOTE(review): appears to be the earlier name of `default_folder` - confirm
    # nothing still reads it before dropping the field
    default_upload_folder_name: str = "default"
    # fallback folder name GDStorage passes to _clean_path
    default_folder: str = "default"
    # path given to Credentials.from_service_account_file
    service_account: str = "service_account.json"
|
||||
|
||||
|
||||
class GDStorage(Storage):
|
||||
def __init__(self, config: GDConfig):
    """
    Copies the folder settings from `config` and builds the Drive v3 client.

    Args:
        config: GDConfig providing root_folder_id, the default folder name(s)
            and the path of the service-account key file.
    """
    # NOTE(review): default_upload_folder_name looks like the earlier name of
    # default_folder; both are kept until every reader is confirmed migrated
    self.default_upload_folder_name = config.default_upload_folder_name
    self.default_folder = config.default_folder
    self.root_folder_id = config.root_folder_id
    # credentials are loaded once, from the configured key file; a preliminary load
    # of the hard-coded 'service_account.json' was redundant (its result was
    # overwritten) and failed whenever only the configured file existed
    creds = service_account.Credentials.from_service_account_file(
        config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
    self.service = build('drive', 'v3', credentials=creds)
|
||||
|
||||
def get_cdn_url(self, key):
|
||||
|
@ -27,7 +29,7 @@ class GDStorage(Storage):
|
|||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
self.subfolder = self._clean_path(self.subfolder, self.default_upload_folder_name, False)
|
||||
self.subfolder = self._clean_path(self.subfolder, self.default_folder, False)
|
||||
filename = key
|
||||
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue