config refactor and cleanup

pull/33/head
msramalho 2022-06-03 17:32:25 +02:00
rodzic c679e02c73
commit a2fdfacb26
2 zmienionych plików z 74 dodań i 38 usunięć

Wyświetl plik

@ -3,13 +3,19 @@ import argparse, json
import gspread import gspread
from loguru import logger from loguru import logger
from selenium import webdriver from selenium import webdriver
from storages.local_storage import LocalStorage from dataclasses import dataclass
from utils.gworksheet import GWorksheet from utils.gworksheet import GWorksheet
from storages import S3Config, S3Storage
from .wayback_config import WaybackConfig from .wayback_config import WaybackConfig
from .telegram_config import TelegramConfig from .telegram_config import TelegramConfig
from storages import Storage from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage
@dataclass
class SeleniumConfig:
timeout_seconds: int = 120
window_width: int = 1400
window_height: int = 2000
class Config: class Config:
@ -20,8 +26,13 @@ class Config:
c.parse() # parses the values and initializes the Services and API clients c.parse() # parses the values and initializes the Services and API clients
# you can then access the Services and APIs like # you can then access the Services and APIs like
c.s3_config c.s3_config
All the configurations available as cmd line options, when included, will
override the configurations in the config.json file.
Configurations are split between:
1. "secrets" containing API keys for generating services - not kept in memory
2. "execution" containing specific execution configurations
""" """
AVAILABLE_STORAGES = {"s3", "gd", "local"}
def __init__(self): def __init__(self):
self.parser = self.get_argument_parser() self.parser = self.get_argument_parser()
@ -38,37 +49,40 @@ class Config:
with open(self.config_file, "r", encoding="utf-8") as inf: with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = json.load(inf) self.config = json.load(inf)
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {}) execution = self.config.get("execution", {})
# general sheet configurations self.sheet = getattr(self.args, "sheet", execution.get("sheet"))
self.sheet = getattr(self.args, "sheet") or execution.get("sheet")
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
self.header = int(getattr(self.args, "header", execution.get("header", 1)))
Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER)
self.storage = getattr(self.args, "storage", execution.get("storage", "s3"))
self.header = int(getattr(self.args, "header") or execution.get("header", 1)) for key, name in [("s3", "s3"), ("gd", "google_drive")]:
self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER) assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
Storage.TMP_FOLDER = self.tmp_folder
self.storage = execution.get("storage", "s3")
# Column names come from config and can be overwritten by CMD # Column names come from config and can be overwritten by CMD
# in the end all are considered as lower case # in the end all are considered as lower case
config_column_names = execution.get("column_names", {}) config_column_names = execution.get("column_names", {})
self.column_names = {} self.column_names = {}
for k in GWorksheet.COLUMN_NAMES.keys(): for k in GWorksheet.COLUMN_NAMES.keys():
self.column_names[k] = getattr(self.args, k) or config_column_names.get(k) or GWorksheet.COLUMN_NAMES[k] self.column_names[k] = getattr(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
self.column_names = {k: v.lower() for k, v in self.column_names.items()}
# selenium driver # selenium driver
selenium_configs = execution.get("selenium", {}) selenium_configs = execution.get("selenium", {})
self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10)) self.selenium_config = SeleniumConfig(
timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
)
self.webdriver = "not initalized" self.webdriver = "not initalized"
# APIs and service configurations # ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {}) secrets = self.config.get("secrets", {})
# google sheets config # google sheets config
self.gsheets_client = gspread.service_account( self.gsheets_client = gspread.service_account(
filename=secrets.get("google_api", {}).get("filename", 'service_account.json') filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
) )
# facebook config # facebook config
@ -81,15 +95,22 @@ class Config:
bucket=s3["bucket"], bucket=s3["bucket"],
region=s3["region"], region=s3["region"],
key=s3["key"], key=s3["key"],
secret=s3["secret"] secret=s3["secret"],
endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
cdn_url=s3.get("cdn_url", S3Config.cdn_url),
key_path=s3.get("key_path", S3Config.key_path),
private=getattr(self.args, "s3-private", s3.get("private", S3Config.private)),
no_folder=s3.get("no_folder", S3Config.no_folder),
)
# GDrive config
if "google_drive" in secrets:
gd = secrets["google_drive"]
self.gd_config = GDConfig(
root_folder_id=gd.get("root_folder_id"),
default_folder=gd.get("default_folder", GDConfig.default_folder),
service_account=gd.get("service_account", GDConfig.service_account),
) )
self.s3_config.private = getattr(self.args, "private") or s3["private"] or self.s3_config.private
self.s3_config.endpoint_url = s3["endpoint_url"] or self.s3_config.endpoint_url
self.s3_config.cdn_url = s3["cdn_url"] or self.s3_config.cdn_url
self.s3_config.key_path = s3["key_path"] or self.s3_config.key_path
self.s3_config.no_folder = s3["no_folder"] or self.s3_config.no_folder
else:
logger.debug(f"'s3' key not present in the {self.config_file=}")
# wayback machine config # wayback machine config
if "wayback" in secrets: if "wayback" in secrets:
@ -109,32 +130,44 @@ class Config:
else: else:
logger.debug(f"'telegram' key not present in the {self.config_file=}") logger.debug(f"'telegram' key not present in the {self.config_file=}")
del self.config["secrets"] del self.config["secrets"] # delete to prevent leaks
def get_argument_parser(self): def get_argument_parser(self):
parser = argparse.ArgumentParser(description='Automatically archive social media videos from a Google Sheets document') """
Creates the CMD line arguments. 'python auto_archive.py --help'
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. ')
parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json') parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]') parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]')
parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]') parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]')
parser.add_argument('--private', action='store_true', help='Store content without public access permission [execution.header in config.json]') parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]')
for k, v in GWorksheet.COLUMN_NAMES.items(): for k, v in GWorksheet.COLUMN_NAMES.items():
parser.add_argument(f'--col-{k}', action='store', dest=k, help=f'the name of the column to fill with {k} (default={v})') parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to fill with {k} (default='{v}')")
return parser return parser
def set_folder(self, folder): def set_folder(self, folder):
# update the folder in each of the storages # update the folder in each of the storages
self.folder = folder self.folder = folder
self.s3_config.folder = folder if self.s3_config:
self.s3_config.folder = folder
if self.gd_config:
self.gd_config.default_folder = folder
def get_storage(self): def get_storage(self):
"""
creates and returns the configured type of storage
"""
if self.storage == "s3": if self.storage == "s3":
return S3Storage(self.s3_config) return S3Storage(self.s3_config)
elif self.storage == "gd":
return GDStorage(self.gd_config)
elif self.storage == "local": elif self.storage == "local":
return LocalStorage(self.folder) return LocalStorage(self.folder)
raise f"storage {self.storage} not yet implemented" raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}"
def destroy_webdriver(self): def destroy_webdriver(self):
if self.webdriver is not None and type(self.webdriver) != str: if self.webdriver is not None and type(self.webdriver) != str:
@ -146,16 +179,17 @@ class Config:
options.headless = True options.headless = True
options.set_preference('network.protocol-handler.external.tg', False) options.set_preference('network.protocol-handler.external.tg', False)
self.webdriver = webdriver.Firefox(options=options) self.webdriver = webdriver.Firefox(options=options)
self.webdriver.set_window_size(1400, 2000) self.webdriver.set_window_size(self.selenium_config.window_width,
self.webdriver.set_page_load_timeout(self.selenium_timeout) self.selenium_config.window_height)
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
def __str__(self) -> str: def __str__(self) -> str:
return json.dumps({ return json.dumps({
"config_file": self.config_file, "config_file": self.config_file,
"sheet": self.sheet, "sheet": self.sheet,
"header": self.header, "header": self.header,
"tmp_folder": self.tmp_folder, "tmp_folder": Storage.TMP_FOLDER,
"selenium_timeout_seconds": self.selenium_timeout, "selenium_config": self.selenium_config,
"selenium_webdriver": self.webdriver != None, "selenium_webdriver": self.webdriver != None,
"s3_config": self.s3_config != None, "s3_config": self.s3_config != None,
"s3_private": getattr(self.s3_config, "private", None), "s3_private": getattr(self.s3_config, "private", None),

Wyświetl plik

@ -12,14 +12,16 @@ import time
@dataclass @dataclass
class GDConfig: class GDConfig:
root_folder_id: str root_folder_id: str
default_upload_folder_name: str = "default" default_folder: str = "default"
service_account: str = "service_account.json"
class GDStorage(Storage): class GDStorage(Storage):
def __init__(self, config: GDConfig): def __init__(self, config: GDConfig):
self.default_upload_folder_name = config.default_upload_folder_name self.default_folder = config.default_folder
self.root_folder_id = config.root_folder_id self.root_folder_id = config.root_folder_id
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=['https://www.googleapis.com/auth/drive']) creds = service_account.Credentials.from_service_account_file(
config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
self.service = build('drive', 'v3', credentials=creds) self.service = build('drive', 'v3', credentials=creds)
def get_cdn_url(self, key): def get_cdn_url(self, key):
@ -27,7 +29,7 @@ class GDStorage(Storage):
only support files saved in a folder for GD only support files saved in a folder for GD
S3 supports folder and all stored in the root S3 supports folder and all stored in the root
""" """
self.subfolder = self._clean_path(self.subfolder, self.default_upload_folder_name, False) self.subfolder = self._clean_path(self.subfolder, self.default_folder, False)
filename = key filename = key
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD') logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')