further cleanup

pull/72/head
msramalho 2023-01-21 19:57:54 +00:00
rodzic 9bd8ea0994
commit 746f6a333e
9 zmienionych plików z 8 dodań i 46 usunięć

Wyświetl plik

@ -105,7 +105,6 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item)
result.add_media(media)
# .set_title(tweet["TODO"])
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result

Wyświetl plik

@ -6,7 +6,7 @@ from typing import List
from collections import defaultdict
from ..archivers import Archiver
from ..feeders import Feeder
from ..feeders import Feeder, CLIFeeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import Storage
@ -16,8 +16,6 @@ from ..enrichers import Enricher
@dataclass
class Config:
# TODO: should Config inherit from Step so it can have its own configurations?
# these are only detected if they are exposed in the respective __init__.py
configurable_parents = [
Feeder,
Enricher,
@ -27,18 +25,17 @@ class Config:
Formatter
# Util
]
feeder: Step # TODO:= BaseFeeder
feeder: Feeder
formatter: Formatter
archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
archivers: List[Archiver] = field(default_factory=[])
enrichers: List[Enricher] = field(default_factory=[])
storages: List[Step] = field(default_factory=[]) # TODO: fix type
storages: List[Storage] = field(default_factory=[])
databases: List[Database] = field(default_factory=[])
def __init__(self) -> None:
self.defaults = {}
self.cli_ops = {}
self.config = {}
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
def parse(self, use_cli=True, yaml_config_filename: str = None):
"""
@ -49,7 +46,7 @@ class Config:
if use_cli:
parser = argparse.ArgumentParser(
# prog = "auto-archiver",
description="Auto Archiver is a ...!", # TODO: update
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
)
@ -63,7 +60,7 @@ class Config:
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
config_path = f"{child.name}.{config}"
if use_cli:
try:
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))

Wyświetl plik

@ -22,9 +22,6 @@ class Metadata:
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
# def __init__(self, url, metadata = {}) -> None:
# self.set_url(url)
# self.metadata = metadata
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@ -134,16 +131,11 @@ class Metadata:
return self
def get_single_media(self) -> Media:
# TODO: could be refactored to use a custom media.id
# TODO: could be refactored to use a custom media.id or metadata
if self.final_media:
return self.final_media
return self.media[0]
# def as_json(self) -> str:
# # converts all metadata and data into JSON
# return json.dumps(self.metadata)
# #TODO: datetime is not serializable
def get_clean_metadata(self) -> Metadata:
return dict(
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},

Wyświetl plik

@ -132,7 +132,6 @@ class ArchivingOrchestrator:
# a.download(result) # TODO: refactor so there's no merge here
logger.info(f"Trying archiver {a.name}")
result.merge(a.download(result))
# TODO: fix logic to halt when done
if result.is_success(): break
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?

Wyświetl plik

@ -42,7 +42,6 @@ class GsheetsDb(Database):
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check if the given item has been archived already"""
# TODO: this should not be done at the feeder stage then!
return False
def done(self, item: Metadata) -> None:

Wyświetl plik

@ -35,11 +35,6 @@ class HtmlFormatter(Formatter):
}
def format(self, item: Metadata) -> Media:
media = item.media
# thumbnails
# TODO: thumbnails_media work per media, gah
# if self.detect_thumbnails:
content = self.template.render(
url=item.get_url(),
title=item.get_title(),

Wyświetl plik

@ -38,6 +38,7 @@ class Storage(Step):
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> None:
#TODO: accept options to make these predictable or random
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")

Wyświetl plik

@ -1,6 +1,5 @@
# we need to explicitly expose the available imports here
from .gworksheet import GWorksheet
from .misc import *
from .util import Util
from .webdriver import Webdriver
from .gsheet import Gsheets

Wyświetl plik

@ -1,19 +0,0 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from ..core import Metadata, Step
#TODO: likely unused
@dataclass
class Util(Step):
    """Step subclass whose class-level ``name`` is ``"util"``.

    Declares one abstract hook, ``enrich``.
    NOTE(review): ``@abstractmethod`` is only enforced if ``Step`` uses an
    ABC metaclass -- confirm against ``Step``'s definition.
    """

    name = "util"

    def __init__(self, config: dict) -> None:
        # NOTE(review): ``config`` is accepted but not forwarded to
        # ``Step.__init__`` -- confirm whether Step expects it.
        Step.__init__(self)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Util:
        """Delegate to ``Step.init`` with ``Util`` as the class argument.

        Declared static and routed through ``Step`` explicitly: this method
        takes no ``self``/``cls``, so a zero-argument ``super()`` would be
        handed the ``name`` string as its instance and raise ``TypeError``.

        Args:
            name: passed through unchanged to ``Step.init``.
            config: passed through unchanged to ``Step.init``.

        Returns:
            Whatever ``Step.init`` returns (annotated as ``Util``).
        """
        return Step.init(name, config, Util)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Abstract hook: takes a ``Metadata`` and is annotated to return one."""