merge logic started

pull/72/head
msramalho 2022-12-14 16:11:06 +00:00
rodzic 53ffa2d4ae
commit 9c056d001c
4 zmienionych plików z 43 dodań i 17 usunięć

Wyświetl plik

@ -87,9 +87,11 @@ class TelethonArchiver(Archiverv2):
pbar.update() pbar.update()
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
"""
if this url is archivable will download post info and look for other posts from the same group with media.
can handle private/public channels
"""
url = item.get_url() url = item.get_url()
print(f"downloading {url=}")
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
match = self.link_pattern.search(url) match = self.link_pattern.search(url)
if not match: return False if not match: return False
@ -126,7 +128,8 @@ class TelethonArchiver(Archiverv2):
# media can also be in entities # media can also be in entities
if mp.entities: if mp.entities:
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
for om_url in other_media_urls: for om_url in other_media_urls:
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}')
@ -140,7 +143,7 @@ class TelethonArchiver(Archiverv2):
continue continue
result.add_media(filename) result.add_media(filename)
result.set("post", post).set_title(title).set_timestamp(post.date) result.set("post", str(post)).set_title(title).set_timestamp(post.date)
return result return result
def _get_media_posts_in_group(self, chat, original_post, max_amp=10): def _get_media_posts_in_group(self, chat, original_post, max_amp=10):

Wyświetl plik

@ -4,6 +4,7 @@ from ast import List
from typing import Any, Union, Dict from typing import Any, Union, Dict
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
import json
@dataclass @dataclass
@ -21,12 +22,25 @@ class Metadata:
self.status = status self.status = status
self.metadata = {} self.metadata = {}
# @staticmethod
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
# should return a merged version of the Metadata """
# will work for archived() and enriched() merges two Metadata instances, will overwrite according to overwrite_left flag
# what if 2 metadatas contain the same keys? only one can remain! : overwrite_left """
pass res = Metadata()
if overwrite_left:
res.status = right.status
res.metadata = dict(self.metadata) # make a copy
for k, v in right.metadata.items():
print(type(v), type(self.get(k)))
# assert type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in res.metadata:
res.set(k, v)
else: # key conflict
if type(v) in [dict, set]: res.set(k, self.get(k) | v)
elif type(v) == list: res.set(k, self.get(k) + v)
else: # invert and do same logic
return right.merge(self)
return res
# TODO: setters? # TODO: setters?
def set(self, key: str, val: Any) -> Metadata: def set(self, key: str, val: Any) -> Metadata:
@ -34,8 +48,10 @@ class Metadata:
self.metadata[key] = val self.metadata[key] = val
return self return self
def get(self, key: str, default: Any = None) -> Union[Metadata, str]: def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
# goes through metadata and returns the Metadata available # goes through metadata and returns the Metadata available
if create_if_missing and key not in self.metadata:
self.metadata[key] = default
return self.metadata.get(key, default) return self.metadata.get(key, default)
# custom getter/setters # custom getter/setters
@ -50,7 +66,11 @@ class Metadata:
return url return url
def get_media(self) -> List: def get_media(self) -> List:
return self.get("media", []) return self.get("media", [], create_if_missing=True)
def set_content(self, content: str) -> Metadata:
# the main textual content/information from a social media post, webpage, ...
return self.set("content", content)
def set_title(self, title: str) -> Metadata: def set_title(self, title: str) -> Metadata:
return self.set("title", title) return self.set("title", title)
@ -59,8 +79,10 @@ class Metadata:
return self.set("title", title) return self.set("title", title)
def add_media(self, filename: str) -> Metadata: def add_media(self, filename: str) -> Metadata:
# print(f"adding {filename} to {self.metadata.get('media')}")
# return self.set("media", self.get_media() + [filename])
return self.get_media().append(filename) return self.get_media().append(filename)
def as_json(self) -> str: def as_json(self) -> str:
# converts all metadata and data into JSON # converts all metadata and data into JSON
pass return json.dumps(self.metadata)

Wyświetl plik

@ -157,8 +157,9 @@ class ArchivingOrchestrator:
for url in self.feeder: for url in self.feeder:
print("ARCHIVING", url) print("ARCHIVING", url)
with tempfile.TemporaryDirectory(dir="./") as tmp_dir: with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
self.archive(url, tmp_dir) result = self.archive(url, tmp_dir)
print(result)
print(result.as_json())
print("holding on") print("holding on")
time.sleep(300) time.sleep(300)
# how does this handle the parameters like folder which can be different for each archiver? # how does this handle the parameters like folder which can be different for each archiver?

Wyświetl plik

@ -22,10 +22,10 @@ class Step(ABC):
def init(name: str, config: dict, child: Type[Step]) -> Step: def init(name: str, config: dict, child: Type[Step]) -> Step:
""" """
cannot find subclasses of child.subclasses looks into direct subclasses of child for name and returns such an object
TODO: cannot find subclasses of child.subclasses
""" """
for sub in child.__subclasses__(): for sub in child.__subclasses__():
if sub.name == name: if sub.name == name:
print(sub.name, "CALLING NEW")
return sub(config) return sub(config)
raise ClassFoundException(f"Unable to initialize STEP with {name=}") raise ClassFoundException(f"Unable to initialize STEP with {name=}")