kopia lustrzana https://github.com/bellingcat/auto-archiver
further cleanup
rodzic
9bd8ea0994
commit
746f6a333e
|
@ -105,7 +105,6 @@ class TwitterArchiver(Archiver):
|
|||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item)
|
||||
result.add_media(media)
|
||||
|
||||
# .set_title(tweet["TODO"])
|
||||
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
|
||||
return result
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from typing import List
|
|||
from collections import defaultdict
|
||||
|
||||
from ..archivers import Archiver
|
||||
from ..feeders import Feeder
|
||||
from ..feeders import Feeder, CLIFeeder
|
||||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
|
@ -16,8 +16,6 @@ from ..enrichers import Enricher
|
|||
|
||||
@dataclass
|
||||
class Config:
|
||||
# TODO: should Config inherit from Step so it can have its own configurations?
|
||||
# these are only detected if they are put to the respective __init__.py
|
||||
configurable_parents = [
|
||||
Feeder,
|
||||
Enricher,
|
||||
|
@ -27,18 +25,17 @@ class Config:
|
|||
Formatter
|
||||
# Util
|
||||
]
|
||||
feeder: Step # TODO:= BaseFeeder
|
||||
feeder: Feeder
|
||||
formatter: Formatter
|
||||
archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
|
||||
archivers: List[Archiver] = field(default_factory=[])
|
||||
enrichers: List[Enricher] = field(default_factory=[])
|
||||
storages: List[Step] = field(default_factory=[]) # TODO: fix type
|
||||
storages: List[Storage] = field(default_factory=[])
|
||||
databases: List[Database] = field(default_factory=[])
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.defaults = {}
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
|
||||
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None):
|
||||
"""
|
||||
|
@ -49,7 +46,7 @@ class Config:
|
|||
if use_cli:
|
||||
parser = argparse.ArgumentParser(
|
||||
# prog = "auto-archiver",
|
||||
description="Auto Archiver is a ...!", # TODO: update
|
||||
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
|
||||
|
@ -63,7 +60,7 @@ class Config:
|
|||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
config_path = f"{child.name}.{config}"
|
||||
|
||||
|
||||
if use_cli:
|
||||
try:
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
|
|
|
@ -22,9 +22,6 @@ class Metadata:
|
|||
final_media: Media = None # can be overwritten by formatters
|
||||
rearchivable: bool = False
|
||||
|
||||
# def __init__(self, url, metadata = {}) -> None:
|
||||
# self.set_url(url)
|
||||
# self.metadata = metadata
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
|
@ -134,16 +131,11 @@ class Metadata:
|
|||
return self
|
||||
|
||||
def get_single_media(self) -> Media:
|
||||
# TODO: could be refactored to use a custom media.id
|
||||
# TODO: could be refactored to use a custom media.id or metadata
|
||||
if self.final_media:
|
||||
return self.final_media
|
||||
return self.media[0]
|
||||
|
||||
# def as_json(self) -> str:
|
||||
# # converts all metadata and data into JSON
|
||||
# return json.dumps(self.metadata)
|
||||
# #TODO: datetime is not serializable
|
||||
|
||||
def get_clean_metadata(self) -> Metadata:
|
||||
return dict(
|
||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
||||
|
|
|
@ -132,7 +132,6 @@ class ArchivingOrchestrator:
|
|||
# a.download(result) # TODO: refactor so there's not merge here
|
||||
logger.info(f"Trying archiver {a.name}")
|
||||
result.merge(a.download(result))
|
||||
# TODO: fix logic to halt when done
|
||||
if result.is_success(): break
|
||||
|
||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||
|
|
|
@ -42,7 +42,6 @@ class GsheetsDb(Database):
|
|||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check if the given item has been archived already"""
|
||||
# TODO: this should not be done at the feeder stage then!
|
||||
return False
|
||||
|
||||
def done(self, item: Metadata) -> None:
|
||||
|
|
|
@ -35,11 +35,6 @@ class HtmlFormatter(Formatter):
|
|||
}
|
||||
|
||||
def format(self, item: Metadata) -> Media:
|
||||
media = item.media
|
||||
# thumbnails
|
||||
# TODO: thumbnails_media work per media, gah
|
||||
# if self.detect_thumbnails:
|
||||
|
||||
content = self.template.render(
|
||||
url=item.get_url(),
|
||||
title=item.get_title(),
|
||||
|
|
|
@ -38,6 +38,7 @@ class Storage(Step):
|
|||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, item: Metadata) -> None:
|
||||
#TODO: accept options to make these predictable or random
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = item.get("folder", "")
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
from .util import Util
|
||||
from .webdriver import Webdriver
|
||||
from .gsheet import Gsheets
|
|
@ -1,19 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from ..core import Metadata, Step
|
||||
|
||||
#TODO: likely unused
|
||||
@dataclass
|
||||
class Util(Step):
|
||||
name = "util"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
Step.__init__(self)
|
||||
|
||||
# only for typing...
|
||||
def init(name: str, config: dict) -> Util:
|
||||
return super().init(name, config, Util)
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, item: Metadata) -> Metadata: pass
|
Ładowanie…
Reference in New Issue