Mirror of https://github.com/bellingcat/auto-archiver
Archiving Context refactor complete
parent 906ed0f6e0
commit 6f6eb2db7a
@@ -1,5 +1,5 @@
-from .media import Media
 from .metadata import Metadata
+from .media import Media
 from .step import Step
 from .context import ArchivingContext
 
@@ -1,3 +1,5 @@
+from loguru import logger
+
 
 class ArchivingContext:
     """
@@ -7,11 +9,15 @@ class ArchivingContext:
    ArchivingContext.set(key, value)
    and
    ArchivingContext.get(key, default)
+
+    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
+    reset(full_reset=True) will recreate everything including the keep_on_reset status
    """
    _instance = None

    def __init__(self):
        self.configs = {}
+        self.keep_on_reset = set()

    @staticmethod
    def get_instance():
@@ -20,13 +26,22 @@ class ArchivingContext:
        return ArchivingContext._instance

    @staticmethod
-    def set(key, value):
-        ArchivingContext.get_instance().configs[key] = value
+    def set(key, value, keep_on_reset: bool = False):
+        logger.error(f"SET [{key}]={value}")
+        ac = ArchivingContext.get_instance()
+        ac.configs[key] = value
+        if keep_on_reset: ac.keep_on_reset.add(key)

    @staticmethod
    def get(key: str, default=None):
        return ArchivingContext.get_instance().configs.get(key, default)

+    @staticmethod
+    def reset(full_reset: bool = False):
+        ac = ArchivingContext.get_instance()
+        if full_reset: ac.keep_on_reset = set()
+        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
+
    # ---- custom getters/setters for widely used context values

    @staticmethod
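
The `keep_on_reset` / `reset` semantics above (apparently core/context.py) are the heart of this refactor. Below is a small standalone sketch of that behavior: the class body is re-typed from the hunks above minus the leftover `logger.error` debug line, so the example runs without the package installed, and `get_instance` is filled in with the usual lazy-singleton shape, which the hunk only partially shows.

```python
# Standalone sketch of the reset semantics introduced in this commit.
class ArchivingContext:
    _instance = None

    def __init__(self):
        self.configs = {}
        self.keep_on_reset = set()

    @staticmethod
    def get_instance():
        # lazy singleton; the hunk above only shows the return statement
        if ArchivingContext._instance is None:
            ArchivingContext._instance = ArchivingContext()
        return ArchivingContext._instance

    @staticmethod
    def set(key, value, keep_on_reset: bool = False):
        ac = ArchivingContext.get_instance()
        ac.configs[key] = value
        if keep_on_reset: ac.keep_on_reset.add(key)

    @staticmethod
    def get(key, default=None):
        return ArchivingContext.get_instance().configs.get(key, default)

    @staticmethod
    def reset(full_reset: bool = False):
        ac = ArchivingContext.get_instance()
        if full_reset: ac.keep_on_reset = set()
        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}


ArchivingContext.set("storages", ["s3"], keep_on_reset=True)
ArchivingContext.set("folder", "cli")
ArchivingContext.reset()
assert ArchivingContext.get("storages") == ["s3"]   # kept across per-item reset
assert ArchivingContext.get("folder") is None       # per-item value cleared
ArchivingContext.reset(full_reset=True)
assert ArchivingContext.get("storages") is None     # full reset drops everything
```

Values set with `keep_on_reset=True` (such as "storages" in the orchestrator hunk below) survive the per-item `reset()` that now runs between archived URLs.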
@@ -3,9 +3,12 @@ from __future__ import annotations
 from ast import List
 from typing import Any
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import mimetypes
 
+from .context import ArchivingContext
+
+from loguru import logger
 
 
 @dataclass_json # annotation order matters
@@ -14,8 +17,29 @@ class Media:
     filename: str
     key: str = None
     urls: List[str] = field(default_factory=list)
-    _mimetype: str = None # eg: image/jpeg
     properties: dict = field(default_factory=dict)
+    _mimetype: str = None # eg: image/jpeg
+    _stored: bool = field(default=False, repr=False, metadata=config(exclude=True))
+
+    def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
+        # stores the media into the provided/available storages [Storage]
+        # repeats the process for its properties, in case they have inner media themselves
+        # for now it only goes down 1 level but it's easy to make it recursive if needed
+        storages = override_storages or ArchivingContext.get("storages")
+        if not len(storages):
+            logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
+            return
+
+        for s in storages:
+            s.store(self, url)
+            # Media can be inside media properties, examples include transformations on original media
+            for prop in self.properties.values():
+                if isinstance(prop, Media):
+                    s.store(prop, url)
+                if isinstance(prop, list):
+                    for prop_media in prop:
+                        if isinstance(prop_media, Media):
+                            s.store(prop_media, url)
 
     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
@@ -44,10 +68,3 @@ class Media:
 
     def is_audio(self) -> bool:
         return self.mimetype.startswith("audio")
-
-    def store(self):
-        """
-        either stores this media entry and all its media descendants
-        or returns if that process is already completed
-        """
-        pass
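
The new `Media.store` walks exactly one level into `properties` looking for nested `Media` (thumbnails, screenshots, transformations), either as direct values or inside lists. A minimal sketch of that traversal, using stand-in classes rather than the real ones (`FakeMedia` and `RecordingStorage` are illustrative, not part of the codebase):

```python
# One-level traversal as in the new Media.store: each storage receives the
# media itself, then any Media found directly in .properties (value or list).
from dataclasses import dataclass, field

@dataclass
class FakeMedia:
    filename: str
    properties: dict = field(default_factory=dict)

class RecordingStorage:
    def __init__(self): self.stored = []
    def store(self, media, url): self.stored.append(media.filename)

thumb = FakeMedia("thumb.jpg")
frames = [FakeMedia("f1.jpg"), FakeMedia("f2.jpg")]
video = FakeMedia("video.mp4", {"thumbnail": thumb, "frames": frames, "duration": 12})

s = RecordingStorage()
s.store(video, "https://example.com")           # the media itself
for prop in video.properties.values():          # one level into properties
    if isinstance(prop, FakeMedia):
        s.store(prop, "https://example.com")
    if isinstance(prop, list):
        for pm in prop:
            if isinstance(pm, FakeMedia):
                s.store(pm, "https://example.com")

assert s.stored == ["video.mp4", "thumb.jpg", "f1.jpg", "f2.jpg"]  # "duration" skipped
```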
@@ -8,9 +8,10 @@ import datetime
 from urllib.parse import urlparse
 from dateutil.parser import parse as parse_dt
 from .media import Media
+from .context import ArchivingContext
 
-# annotation order matters
-@dataclass_json
+
+@dataclass_json # annotation order matters
 @dataclass
 class Metadata:
     status: str = "no archiver"
@@ -23,7 +24,6 @@ class Metadata:
     tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
     # tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
-
 
     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -46,6 +46,12 @@ class Metadata:
             return right.merge(self)
         return self
 
+    def store(self: Metadata, override_storages: List = None):
+        # calls .store for all contained media. storages [Storage]
+        storages = override_storages or ArchivingContext.get("storages")
+        for media in self.media:
+            media.store(override_storages=storages)
+
     def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
         # if not self.metadata: self.metadata = {}
         self.metadata[key] = val
@@ -144,4 +150,3 @@ class Metadata:
 
     def __str__(self) -> str:
         return self.__repr__()
-
@@ -25,7 +25,7 @@ class ArchivingOrchestrator:
         self.archivers: List[Archiver] = config.archivers
         self.databases: List[Database] = config.databases
         self.storages: List[Storage] = config.storages
-        ArchivingContext.set("storages", self.storages)
+        ArchivingContext.set("storages", self.storages, keep_on_reset=True)
 
         for a in self.archivers: a.setup()
 
@@ -35,6 +35,7 @@ class ArchivingOrchestrator:
 
     def feed_item(self, item: Metadata) -> Metadata:
         try:
+            ArchivingContext.reset()
             with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
                 ArchivingContext.set_tmp_dir(tmp_dir)
                 return self.archive(item)
@@ -108,22 +109,12 @@ class ArchivingOrchestrator:
 
         # 5 - store media
         # looks for Media in result.media and also result.media[x].properties (as list or dict values)
-        for s in self.storages:
-            for m in result.media:
-                s.store(m, result) # modifies media
-                # Media can be inside media properties, examples include transformations on original media
-                for prop in m.properties.values():
-                    if isinstance(prop, Media):
-                        s.store(prop, result)
-                    if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
-                        for prop_media in prop:
-                            s.store(prop_media, result)
+        result.store()
 
         # 6 - format and store formatted if needed
         # enrichers typically need access to already stored URLs etc
         if (final_media := self.formatter.format(result)):
-            for s in self.storages:
-                s.store(final_media, result)
+            final_media.store()
             result.set_final_media(final_media)
 
         if result.is_empty():
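
Taken together, the orchestrator hunks give each item this lifecycle: reset the context (keeping `keep_on_reset` values such as "storages"), register a fresh temporary directory, archive, and let the result store itself instead of the orchestrator iterating storages. A condensed sketch of that flow; the import path and the `orchestrator` argument are assumptions for illustration, and names otherwise follow the diff:

```python
# Condensed per-item flow implied by the orchestrator diff above.
import tempfile
from auto_archiver.core import ArchivingContext  # assumed import path

def feed_item_sketch(orchestrator, item):
    ArchivingContext.reset()                   # clears per-item values only
    with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
        ArchivingContext.set_tmp_dir(tmp_dir)  # helper shown as context in the hunk
        result = orchestrator.archive(item)
        result.store()                         # replaces the nested storage loops
        return result
```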
@@ -5,8 +5,7 @@ from urllib.parse import quote
 from loguru import logger
 
 from . import Database
-from ..core import Metadata
-from ..core import Media
+from ..core import Metadata, Media, ArchivingContext
 from ..utils import GWorksheet
 
 
@@ -86,7 +85,7 @@ class GsheetsDb(Database):
             logger.debug(f"Unable to update sheet: {e}")
 
     def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
-        gw: GWorksheet = item.get("gsheet").get("worksheet")
-        row: int = item.get("gsheet").get("row")
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
+        gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
+        row: int = ArchivingContext.get("gsheet").get("row")
         return gw, row
@@ -1,7 +1,7 @@
 from loguru import logger
 
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 
 
 class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url).set("folder", "cli", True)
+            yield Metadata().set_url(url)
+            ArchivingContext.set("folder", "cli")
+
         logger.success(f"Processed {len(self.urls)} URL(s)")
@@ -5,9 +5,10 @@ from slugify import slugify
 
 # from . import Enricher
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 from ..utils import Gsheets, GWorksheet
 
 
 class GsheetsFeeder(Gsheets, Feeder):
     name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
             "help": "(CSV) explicitly block some worksheets from being processed",
             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
         },
-        "use_sheet_names_in_stored_paths":{
+        "use_sheet_names_in_stored_paths": {
            "default": True,
            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
        }
@@ -61,9 +62,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                if status not in ['', None]: continue

                # All checks done - archival process starts here
-                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                m = Metadata().set_url(url)
+                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                if self.use_sheet_names_in_stored_paths:
-                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                    ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
                yield m

            logger.success(f'Finished worksheet {wks.title}')
@@ -29,7 +29,7 @@
         margin: auto;
         border: 1px solid;
         border-collapse: collapse;
-        vertical-align:top;
+        vertical-align: top;
     }
 
     table.metadata td:first-child {
@@ -185,7 +185,11 @@
     el.addEventListener("copy", (e) => {
         e.preventDefault();
         if (e.clipboardData) {
+            if (el.hasAttribute("copy-value")) {
+                e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
+            } else {
                e.clipboardData.setData("text/plain", el.textContent);
+            }
             console.log(e.clipboardData.getData("text"))
             showNotification("copied!")
         }
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
 {% endif %}
 {% if links %}
     <a href="{{ url }}">open</a> or
-    <a href="{{ url }}" download="">download</a>
+    <a href="{{ url }}" download="">download</a> or
+    {{ copy_urlize(url, "copy") }}
+
     <br>
 {% endif %}
 {% endfor %}
 
 {%- endmacro -%}
 
-{% macro copy_urlize(val) -%}
+{% macro copy_urlize(val, href_text="") -%}
 
 {% if val is mapping %}
 <ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
 </ul>
 
 {% else %}
+    {% if href_text | length == 0 %}
     <span class="copy">{{ val | string | urlize }}</span>
+    {% else %}
+    <span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
+    {% endif %}
 {% endif %}
 
 {%- endmacro -%}
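
The `copy-value` attribute the macro now emits is what the new clipboard handler in the script hunk above reads first, falling back to the element's text. A runnable rendering of the dispatch with plain Jinja2 (assumed installed); the macro body is simplified from the template above — the `urlize` filter and the mapping branch are dropped, and the `href_text=""` default is inferred from the `href_text | length == 0` check:

```python
# Renders the copy_urlize dispatch: with href_text, the value rides along in
# copy-value; without it, the value itself is shown.
from jinja2 import Environment

tpl = Environment().from_string(
    '{% macro copy_urlize(val, href_text="") -%}'
    '{% if href_text | length == 0 -%}'
    '<span class="copy">{{ val }}</span>'
    '{%- else -%}'
    '<span class="copy" copy-value="{{ val }}">{{ href_text }}</span>'
    '{%- endif %}'
    '{%- endmacro -%}'
    '{{ copy_urlize("https://cdn.example.com/x.mp4", "copy") }}\n'
    '{{ copy_urlize("plain value") }}'
)
print(tpl.render())
# <span class="copy" copy-value="https://cdn.example.com/x.mp4">copy</span>
# <span class="copy">plain value</span>
```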
@@ -1,10 +1,9 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
-import hashlib
-from typing import IO, Any
+from typing import IO
 
-from ..core import Media, Metadata, Step
+from ..core import Media, Step, ArchivingContext
 from ..enrichers import HashEnricher
 from loguru import logger
 import os, uuid
@@ -42,10 +41,11 @@ class Storage(Step):
         # only for typing...
         return Step.init(name, config, Storage)
 
-    def store(self, media: Media, item: Metadata) -> None:
-        self.set_key(media, item)
+    def store(self, media: Media, url: str) -> None:
+        self.set_key(media, url)
         self.upload(media)
         media.add_url(self.get_cdn_url(media))
+        media.
 
     @abstractmethod
     def get_cdn_url(self, media: Media) -> str: pass
@@ -58,19 +58,19 @@ class Storage(Step):
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)
 
-    def set_key(self, media: Media, item: Metadata) -> None:
+    def set_key(self, media: Media, url) -> None:
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
-        folder = item.get("folder", "")
+        folder = ArchivingContext.get("folder", "")
         filename, ext = os.path.splitext(media.filename)
 
         # path_generator logic
         if self.path_generator == "flat":
             path = ""
             filename = slugify(filename) # in case it comes with os.sep
-        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "url": path = slugify(url)
         elif self.path_generator == "random":
-            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+            path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
 
         # filename_generator logic
         if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
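
After this hunk, `set_key` derives the storage key without a `Metadata` item: the folder segment comes from `ArchivingContext` and the URL is passed in directly. A simplified, runnable re-typing of that composition; the final `os.path.join` mirrors the unshown tail of `set_key`, and the remaining filename logic visible elsewhere in the file is omitted, so treat this as a sketch:

```python
# How a storage key is composed: folder (from context) / path / filename+ext.
import os, uuid
from slugify import slugify  # python-slugify, already used by the module

def make_key(media_filename: str, folder: str = "", path_generator: str = "url",
             filename_generator: str = "", url: str = "https://example.com/post/1") -> str:
    filename, ext = os.path.splitext(media_filename)
    path = ""
    if path_generator == "flat":
        filename = slugify(filename)      # in case it comes with os.sep
    elif path_generator == "url":
        path = slugify(url)
    elif path_generator == "random":
        path = str(uuid.uuid4())[:16]
    if filename_generator == "random":
        filename = str(uuid.uuid4())[:16]
    return os.path.join(folder, path, f"{filename}{ext}")

print(make_key("video.mp4", folder="my-workbook/sheet1"))
# my-workbook/sheet1/https-example-com-post-1/video.mp4
```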