From 746f6a333ef41d9da92c5df689e5a54353f58e22 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 21 Jan 2023 19:57:54 +0000 Subject: [PATCH] further cleanup --- .../archivers/twitter_archiver.py | 1 - src/auto_archiver/core/config.py | 15 ++++++--------- src/auto_archiver/core/metadata.py | 10 +--------- src/auto_archiver/core/orchestrator.py | 1 - src/auto_archiver/databases/gsheet_db.py | 1 - .../formatters/html_formatter.py | 5 ----- src/auto_archiver/storages/storage.py | 1 + src/auto_archiver/utils/__init__.py | 1 - src/auto_archiver/utils/util.py | 19 ------------------- 9 files changed, 8 insertions(+), 46 deletions(-) delete mode 100644 src/auto_archiver/utils/util.py diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index 194025d..b5b6dda 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -105,7 +105,6 @@ class TwitterArchiver(Archiver): media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item) result.add_media(media) - # .set_title(tweet["TODO"]) result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")) return result diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 73bd585..206593e 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -6,7 +6,7 @@ from typing import List from collections import defaultdict from ..archivers import Archiver -from ..feeders import Feeder +from ..feeders import Feeder, CLIFeeder from ..databases import Database from ..formatters import Formatter from ..storages import Storage @@ -16,8 +16,6 @@ from ..enrichers import Enricher @dataclass class Config: - # TODO: should Config inherit from Step so it can have it's own configurations? - # these are only detected if they are put to the respective __init__.py configurable_parents = [ Feeder, Enricher, @@ -27,18 +25,17 @@ class Config: Formatter # Util ] - feeder: Step # TODO:= BaseFeeder + feeder: Feeder formatter: Formatter - archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type + archivers: List[Archiver] = field(default_factory=[]) enrichers: List[Enricher] = field(default_factory=[]) - storages: List[Step] = field(default_factory=[]) # TODO: fix type + storages: List[Storage] = field(default_factory=[]) databases: List[Database] = field(default_factory=[]) def __init__(self) -> None: self.defaults = {} self.cli_ops = {} self.config = {} - # TODO: make this work for nested props like gsheet_feeder.columns.url = "URL" def parse(self, use_cli=True, yaml_config_filename: str = None): """ @@ -49,7 +46,7 @@ class Config: if use_cli: parser = argparse.ArgumentParser( # prog = "auto-archiver", - description="Auto Archiver is a ...!", # TODO: update + description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", epilog="Check the code at https://github.com/bellingcat/auto-archiver" ) @@ -63,7 +60,7 @@ class Config: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - + if use_cli: try: parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index f09d7e0..4707a2f 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -22,9 +22,6 @@ class Metadata: final_media: Media = None # can be overwritten by formatters rearchivable: bool = False - # def __init__(self, url, metadata = {}) -> None: - # self.set_url(url) - # self.metadata = metadata def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: """ @@ -134,16 +131,11 @@ class Metadata: return self def get_single_media(self) -> Media: - # TODO: could be refactored to use a custom media.id + # TODO: could be refactored to use a custom media.id or metadata if self.final_media: return self.final_media return self.media[0] - # def as_json(self) -> str: - # # converts all metadata and data into JSON - # return json.dumps(self.metadata) - # #TODO: datetime is not serializable - def get_clean_metadata(self) -> Metadata: return dict( {k: v for k, v in self.metadata.items() if k not in self.tmp_keys}, diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index bb7902e..eaef270 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -132,7 +132,6 @@ class ArchivingOrchestrator: # a.download(result) # TODO: refactor so there's not merge here logger.info(f"Trying archiver {a.name}") result.merge(a.download(result)) - # TODO: fix logic to halt when done if result.is_success(): break # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index 13660ca..557570e 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -42,7 +42,6 @@ class GsheetsDb(Database): def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check if the given item has been archived already""" - # TODO: this should not be done at the feeder stage then! return False def done(self, item: Metadata) -> None: diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py index 9f5017f..74ba147 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/formatters/html_formatter.py @@ -35,11 +35,6 @@ class HtmlFormatter(Formatter): } def format(self, item: Metadata) -> Media: - media = item.media - # thumbnails - # TODO: thumbnails_media work per media, gah - # if self.detect_thumbnails: - content = self.template.render( url=item.get_url(), title=item.get_title(), diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py index 3e4134a..bfd22f7 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/storages/storage.py @@ -38,6 +38,7 @@ class Storage(Step): return self.uploadf(f, media, **kwargs) def set_key(self, media: Media, item: Metadata) -> None: + #TODO: accept options to make these predictable or random """takes the media and optionally item info and generates a key""" if media.key is not None and len(media.key) > 0: return folder = item.get("folder", "") diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py index a20e191..05eca8b 100644 --- a/src/auto_archiver/utils/__init__.py +++ b/src/auto_archiver/utils/__init__.py @@ -1,6 +1,5 @@ # we need to explicitly expose the available imports here from .gworksheet import GWorksheet from .misc import * -from .util import Util from .webdriver import Webdriver from .gsheet import Gsheets \ No newline at end of file diff --git a/src/auto_archiver/utils/util.py b/src/auto_archiver/utils/util.py deleted file mode 100644 index 898b260..0000000 --- a/src/auto_archiver/utils/util.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import annotations -from abc import abstractmethod -from dataclasses import dataclass -from ..core import Metadata, Step - -#TODO: likely unused -@dataclass -class Util(Step): - name = "util" - - def __init__(self, config: dict) -> None: - Step.__init__(self) - - # only for typing... - def init(name: str, config: dict) -> Util: - return super().init(name, config, Util) - - @abstractmethod - def enrich(self, item: Metadata) -> Metadata: pass