# auto-archiver/src/metadata.py
#
# 96 lines
# 3.6 KiB
# Python

from __future__ import annotations

# NOTE(review): ast.List / ast.Set are AST node classes, not typing generics.
# The line is kept so nothing is removed, but the names are rebound by the
# typing import below, which is what the annotations in this file intend.
from ast import List, Set
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Set, Union

# import json
from media import Media
@dataclass
class Metadata:
    """Mutable record of what is known about one archived item.

    Values live in the free-form ``metadata`` mapping and are accessed
    through :meth:`set` / :meth:`get`; a few convenience setters wrap the
    well-known keys (``url``, ``content``, ``title``, ``timestamp``).
    All setters return ``self`` so calls can be chained.
    """

    status: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)
    # keys that are not to be saved in DBs
    tmp_keys: Set[str] = field(default_factory=set)
    media: List[Media] = field(default_factory=list)
    rearchivable: bool = False

    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """Merge two Metadata instances in place and return the merged one.

        With ``overwrite_left=True`` (default) ``right`` is folded into
        ``self``: ``status`` is replaced, ``rearchivable`` is OR-ed, media
        lists are concatenated and temporary-key markers are unioned. For
        keys present on both sides, exact ``dict``/``set`` values are
        unioned with ``|`` and exact ``list`` values are concatenated; any
        other value from ``right`` replaces the one in ``self``.

        With ``overwrite_left=False`` the roles are swapped: ``self`` is
        folded into ``right`` and ``right`` is returned.

        Raises:
            AssertionError: if a key exists on both sides with values of
                different types.
        """
        if not overwrite_left:
            # invert and do same logic
            return right.merge(self)

        self.status = right.status
        self.rearchivable |= right.rearchivable
        for key, value in right.metadata.items():
            assert key not in self.metadata or type(value) == type(self.get(key)), \
                f"cannot merge key {key!r}: value types differ"
            # Exact type checks on purpose: subclasses of dict/list/set are
            # replaced rather than combined, as before.
            if key not in self.metadata or type(value) not in (dict, list, set):
                self.set(key, value)
            elif type(value) is list:
                self.set(key, self.get(key) + value)
            else:  # dict or set
                self.set(key, self.get(key) | value)
        # Keep the "temporary" marking of keys that came from ``right``;
        # otherwise they would silently become persistent after a merge.
        self.tmp_keys |= right.tmp_keys
        self.media.extend(right.media)
        return self

    def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
        """Store ``val`` under ``key``; ``is_tmp`` also records the key in ``tmp_keys``."""
        self.metadata[key] = val
        if is_tmp:
            self.tmp_keys.add(key)
        return self

    def get(self, key: str, default: Any = None, create_if_missing=False) -> Any:
        """Return the value stored under ``key``, or ``default`` if absent.

        When ``create_if_missing`` is true and the key is absent, ``default``
        is stored under ``key`` first, so later mutations of the returned
        object are visible in ``metadata``.
        """
        if create_if_missing and key not in self.metadata:
            self.metadata[key] = default
        return self.metadata.get(key, default)

    # custom getter/setters

    def set_url(self, url: str) -> Metadata:
        """Store a non-empty string under the ``url`` key."""
        assert isinstance(url, str) and len(url) > 0, "invalid URL"
        return self.set("url", url)

    def get_url(self) -> str:
        """Return the stored ``url``; asserts that it is a non-empty string."""
        url = self.get("url")
        assert isinstance(url, str) and len(url) > 0, "invalid URL"
        return url

    def set_content(self, content: str) -> Metadata:
        """Store the main textual content/information from a social media post, webpage, ..."""
        return self.set("content", content)

    def set_title(self, title: str) -> Metadata:
        """Store ``title`` under the ``title`` key."""
        return self.set("title", title)

    def set_timestamp(self, timestamp: datetime) -> Metadata:
        """Store a ``datetime`` under the ``timestamp`` key."""
        assert isinstance(timestamp, datetime), "set_timestamp expects a datetime instance"
        return self.set("timestamp", timestamp)

    def add_media(self, media: Media) -> Metadata:
        """Append ``media`` to the media list and return ``self`` for chaining."""
        # list.append returns None, so it must not be returned directly.
        self.media.append(media)
        return self

    # TODO: add an as_json() that serialises all metadata; datetime values
    # are not JSON serializable out of the box.

    def cleanup(self) -> Metadata:
        """Currently a no-op that returns ``self``.

        TODO: refactor so it returns a JSON with all intended properties,
        except tmp_keys. Removing the temporary keys from ``metadata`` here
        leads to errors if a database needs them after they are removed.
        """
        return self