From aa5430451e0a605fc75df38ecac400eef46c6aa0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 17 Feb 2023 15:46:29 +0000 Subject: [PATCH] instagram archiver via telegram bot --- src/auto_archiver/archivers/__init__.py | 1 + src/auto_archiver/archivers/telethon_archiver.py | 2 +- src/auto_archiver/archivers/twitter_archiver.py | 4 ++-- src/auto_archiver/core/metadata.py | 5 ++++- src/auto_archiver/core/orchestrator.py | 3 +++ src/auto_archiver/databases/gsheet_db.py | 8 +++----- src/auto_archiver/utils/__init__.py | 3 ++- src/auto_archiver/version.py | 4 ++-- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index f6b5b05..f9cbb55 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver from .twitter_archiver import TwitterArchiver from .twitter_api_archiver import TwitterApiArchiver from .instagram_archiver import InstagramArchiver +from .instagram_tbot_archiver import InstagramTbotArchiver from .tiktok_archiver import TiktokArchiver from .telegram_archiver import TelegramArchiver from .vk_archiver import VkArchiver diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/archivers/telethon_archiver.py index 1490744..5cd6148 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/archivers/telethon_archiver.py @@ -114,7 +114,7 @@ class TelethonArchiver(Archiver): with self.client.start(): # with self.client.start(bot_token=self.bot_token): try: - post = self.client.get_messages(chat, ids=post_id) + post = self.client.get_messages(chat, ids=post_id) except ValueError as e: logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") return False diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index 1128a57..d15ebf0 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -37,7 +37,7 @@ class TwitterArchiver(Archiver): return self.link_clean_pattern.sub("\\1", url) def is_rearchivable(self, url: str) -> bool: - # Twitter posts are static + # Twitter posts are static (for now) return False def download(self, item: Metadata) -> Metadata: @@ -86,7 +86,7 @@ class TwitterArchiver(Archiver): media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item) result.add_media(media) - return result.success("twitter") + return result.success("twitter-snscrape") def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: """ diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 00eefe9..b38af0c 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -63,6 +63,9 @@ class Metadata: def is_success(self) -> bool: return "success" in self.status + def is_empty(self) -> bool: + return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at + @property # getter .netloc def netloc(self) -> str: return urlparse(self.get_url()).netloc @@ -122,7 +125,7 @@ class Metadata: for m in self.media: if m.get("id") == id: return m return default - + def get_first_image(self, default=None) -> Media: for m in self.media: if "image" in m.mimetype: return m diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 47555f9..db3c893 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -123,6 +123,9 @@ class ArchivingOrchestrator: s.store(final_media, result) result.set_final_media(final_media) + if result.is_empty(): + result.status = "nothing archived" + # signal completion to databases (DBs, Google Sheets, CSV, ...) for d in self.databases: d.done(result) diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index 03d149e..b28d8ed 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -2,10 +2,8 @@ from typing import Union, Tuple import datetime from urllib.parse import quote -# from metadata import Metadata from loguru import logger -# from . import Enricher from . import Database from ..core import Metadata from ..core import Media @@ -61,13 +59,13 @@ class GsheetsDb(Database): cell_updates.append((row, 'status', item.status)) media: Media = item.get_final_media() - - batch_if_valid('archive', "\n".join(media.urls)) + if hasattr(media, "urls"): + batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('timestamp', item.get_timestamp()) - if (screenshot := item.get_media_by_id("screenshot")): + if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): batch_if_valid('screenshot', "\n".join(screenshot.urls)) if (thumbnail := item.get_first_image("thumbnail")): diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py index 05eca8b..42ea0f5 100644 --- a/src/auto_archiver/utils/__init__.py +++ b/src/auto_archiver/utils/__init__.py @@ -2,4 +2,5 @@ from .gworksheet import GWorksheet from .misc import * from .webdriver import Webdriver -from .gsheet import Gsheets \ No newline at end of file +from .gsheet import Gsheets +from .url import UrlUtil \ No newline at end of file diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py index 6473fec..d5d005a 100644 --- a/src/auto_archiver/version.py +++ b/src/auto_archiver/version.py @@ -1,9 +1,9 @@ _MAJOR = "0" -_MINOR = "3" +_MINOR = "4" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "0" +_PATCH = "1" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""