diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 267dc2d..4fa3ce0 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -87,9 +87,11 @@ class TelethonArchiver(Archiverv2): pbar.update() def download(self, item: Metadata) -> Metadata: + """ + if this url is archivable will download post info and look for other posts from the same group with media. + can handle private/public channels + """ url = item.get_url() - - print(f"downloading {url=}") # detect URLs that we definitely cannot handle match = self.link_pattern.search(url) if not match: return False @@ -126,8 +128,9 @@ class TelethonArchiver(Archiverv2): # media can also be in entities if mp.entities: - other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] + if len(other_media_urls): + logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") for om_url in other_media_urls: filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') self.download_from_url(om_url, filename) @@ -140,7 +143,7 @@ class TelethonArchiver(Archiverv2): continue result.add_media(filename) - result.set("post", post).set_title(title).set_timestamp(post.date) + result.set("post", str(post)).set_title(title).set_timestamp(post.date) return result def _get_media_posts_in_group(self, chat, original_post, max_amp=10): diff --git a/src/metadata.py b/src/metadata.py index e1e8d8b..193003f 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -4,6 +4,7 @@ from ast import List from typing import Any, Union, Dict from dataclasses import dataclass from datetime import datetime +import json @dataclass @@ -21,12 +22,25 @@ class Metadata: self.status = status self.metadata = {} - # @staticmethod def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: - # should return a merged version of the Metadata - # will work for archived() and enriched() - # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left - pass + """ + merges to Metadata instances, will overwrite according to overwrite_left flag + """ + res = Metadata() + if overwrite_left: + res.status = right.status + res.metadata = dict(self.metadata) # make a copy + for k, v in right.metadata.items(): + print(type(v), type(self.get(k))) + # assert type(v) == type(self.get(k)) + if type(v) not in [dict, list, set] or k not in res.metadata: + res.set(k, v) + else: # key conflict + if type(v) in [dict, set]: res.set(k, self.get(k) | v) + elif type(v) == list: res.set(k, self.get(k) + v) + else: # invert and do same logic + return right.merge(self) + return res # TODO: setters? def set(self, key: str, val: Any) -> Metadata: @@ -34,8 +48,10 @@ class Metadata: self.metadata[key] = val return self - def get(self, key: str, default: Any = None) -> Union[Metadata, str]: + def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]: # goes through metadata and returns the Metadata available + if create_if_missing and key not in self.metadata: + self.metadata[key] = default return self.metadata.get(key, default) # custom getter/setters @@ -50,7 +66,11 @@ class Metadata: return url def get_media(self) -> List: - return self.get("media", []) + return self.get("media", [], create_if_missing=True) + + def set_content(self, content: str) -> Metadata: + # the main textual content/information from a social media post, webpage, ... + return self.set("content", content) def set_title(self, title: str) -> Metadata: return self.set("title", title) @@ -59,8 +79,10 @@ class Metadata: return self.set("title", title) def add_media(self, filename: str) -> Metadata: + # print(f"adding {filename} to {self.metadata.get('media')}") + # return self.set("media", self.get_media() + [filename]) return self.get_media().append(filename) def as_json(self) -> str: # converts all metadata and data into JSON - pass + return json.dumps(self.metadata) diff --git a/src/orchestrator.py b/src/orchestrator.py index 804948e..9a523bf 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -157,8 +157,9 @@ class ArchivingOrchestrator: for url in self.feeder: print("ARCHIVING", url) with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - self.archive(url, tmp_dir) - + result = self.archive(url, tmp_dir) + print(result) + print(result.as_json()) print("holding on") time.sleep(300) # how does this handle the parameters like folder which can be different for each archiver? diff --git a/src/steps/step.py b/src/steps/step.py index 4d7e6c1..7a2135c 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -22,10 +22,10 @@ class Step(ABC): def init(name: str, config: dict, child: Type[Step]) -> Step: """ - cannot find subclasses of child.subclasses + looks into direct subclasses of child for name and returns such ab object + TODO: cannot find subclasses of child.subclasses """ for sub in child.__subclasses__(): if sub.name == name: - print(sub.name, "CALLING NEW") return sub(config) raise ClassFoundException(f"Unable to initialize STEP with {name=}")