From 3dd3775cbd6ffc29542e4eef6602c95dbfc09a80 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 27 Jul 2023 20:14:50 +0100 Subject: [PATCH] removes rearchiving logic --- src/auto_archiver/archivers/archiver.py | 5 ----- src/auto_archiver/archivers/telegram_archiver.py | 4 ---- src/auto_archiver/archivers/telethon_archiver.py | 4 ---- src/auto_archiver/archivers/tiktok_archiver.py | 4 ---- src/auto_archiver/archivers/twitter_archiver.py | 8 ++++---- src/auto_archiver/archivers/vk_archiver.py | 4 ---- src/auto_archiver/core/metadata.py | 2 -- src/auto_archiver/core/orchestrator.py | 8 ++------ 8 files changed, 6 insertions(+), 33 deletions(-) diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index 419ee7a..324c474 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -27,11 +27,6 @@ class Archiver(Step): # used to clean unnecessary URL parameters OR unfurl redirect links return url - def is_rearchivable(self, url: str) -> bool: - # archivers can signal if it does not make sense to rearchive a piece of content - # default is rearchiving - return True - def _guess_file_type(self, path: str) -> str: """ Receives a URL or filename and returns global mimetype like 'image' or 'video' diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/archivers/telegram_archiver.py index cc039f9..ed57927 100644 --- a/src/auto_archiver/archivers/telegram_archiver.py +++ b/src/auto_archiver/archivers/telegram_archiver.py @@ -19,10 +19,6 @@ class TelegramArchiver(Archiver): def configs() -> dict: return {} - def is_rearchivable(self, url: str) -> bool: - # telegram posts are static - return False - def download(self, item: Metadata) -> Metadata: url = item.get_url() # detect URLs that we definitely cannot handle diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/archivers/telethon_archiver.py index 67b5b59..14a0ac1 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/archivers/telethon_archiver.py @@ -38,10 +38,6 @@ class TelethonArchiver(Archiver): } } - def is_rearchivable(self, url: str) -> bool: - # telegram posts are static - return False - def setup(self) -> None: """ 1. trigger login process for telegram or proceed if already saved in a session file diff --git a/src/auto_archiver/archivers/tiktok_archiver.py b/src/auto_archiver/archivers/tiktok_archiver.py index e1fc88f..5871775 100644 --- a/src/auto_archiver/archivers/tiktok_archiver.py +++ b/src/auto_archiver/archivers/tiktok_archiver.py @@ -16,10 +16,6 @@ class TiktokArchiver(Archiver): def configs() -> dict: return {} - def is_rearchivable(self, url: str) -> bool: - # TikTok posts are static - return False - def download(self, item: Metadata) -> Metadata: url = item.get_url() if 'tiktok.com' not in url: diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index c5d907d..1624484 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -37,9 +37,8 @@ class TwitterArchiver(Archiver): # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w return self.link_clean_pattern.sub("\\1", url) - def is_rearchivable(self, url: str) -> bool: - # Twitter posts are static (for now) - return False + def best_quality_url(self, url: str) -> str: + return re.sub(r"name=(\w+)", "name=orig", url, 1) def download(self, item: Metadata) -> Metadata: """ @@ -78,7 +77,7 @@ class TwitterArchiver(Archiver): media.set("src", variant.url) mimetype = variant.contentType elif type(tweet_media) == Photo: - media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig')) + media.set("src", self.best_quality_url(tweet_media.fullUrl)) mimetype = "image/jpeg" else: logger.warning(f"Could not get media URL of {tweet_media}") @@ -118,6 +117,7 @@ class TwitterArchiver(Archiver): for i, u in enumerate(urls): media = Media(filename="") + u = self.best_quality_url(u) media.set("src", u) ext = "" if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]): diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/archivers/vk_archiver.py index 8defb96..f8bb60a 100644 --- a/src/auto_archiver/archivers/vk_archiver.py +++ b/src/auto_archiver/archivers/vk_archiver.py @@ -27,10 +27,6 @@ class VkArchiver(Archiver): "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, } - def is_rearchivable(self, url: str) -> bool: - # VK content is static - return False - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 4ef9388..56bde98 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -16,7 +16,6 @@ class Metadata: status: str = "no archiver" metadata: Dict[str, Any] = field(default_factory=dict) media: List[Media] = field(default_factory=list) - rearchivable: bool = True # defaults to true, archivers can overwrite def __post_init__(self): self.set("_processed_at", datetime.datetime.utcnow()) @@ -29,7 +28,6 @@ class Metadata: if overwrite_left: if right.status and len(right.status): self.status = right.status - self.rearchivable |= right.rearchivable for k, v in right.metadata.items(): assert k not in self.metadata or type(v) == type(self.get(k)) if type(v) not in [dict, list, set] or k not in self.metadata: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 8ac2ddf..1946268 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -62,11 +62,7 @@ class ArchivingOrchestrator: result.set_url(url) if original_url != url: result.set("original_url", original_url) - # 2 - rearchiving logic + notify start to DB - # archivers can signal whether the content is rearchivable: eg: tweet vs webpage - for a in self.archivers: result.rearchivable |= a.is_rearchivable(url) - logger.debug(f"{result.rearchivable=} for {url=}") - + # 2 - notify start to DB # signal to DB that archiving has started # and propagate already archived if it exists cached_result = None @@ -78,7 +74,7 @@ class ArchivingOrchestrator: d.started(result) if (local_result := d.fetch(result)): cached_result = (cached_result or Metadata()).merge(local_result) - if cached_result and not cached_result.rearchivable: + if cached_result: logger.debug("Found previously archived entry") for d in self.databases: d.done(cached_result)