instagram archiver via telegram bot

pull/72/head
msramalho 2023-02-17 15:46:29 +00:00
rodzic f35875a94c
commit aa5430451e
8 zmienionych plików z 18 dodań i 12 usunięć

Wyświetl plik

@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver

Wyświetl plik

@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
with self.client.start():
# with self.client.start(bot_token=self.bot_token):
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False

Wyświetl plik

@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
return self.link_clean_pattern.sub("\\1", url)
def is_rearchivable(self, url: str) -> bool:
# Twitter posts are static
# Twitter posts are static (for now)
return False
def download(self, item: Metadata) -> Metadata:
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter")
return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
"""

Wyświetl plik

@@ -63,6 +63,9 @@ class Metadata:
def is_success(self) -> bool:
return "success" in self.status
def is_empty(self) -> bool:
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
@property # getter .netloc
def netloc(self) -> str:
return urlparse(self.get_url()).netloc
@@ -122,7 +125,7 @@ class Metadata:
for m in self.media:
if m.get("id") == id: return m
return default
def get_first_image(self, default=None) -> Media:
for m in self.media:
if "image" in m.mimetype: return m

Wyświetl plik

@@ -123,6 +123,9 @@ class ArchivingOrchestrator:
s.store(final_media, result)
result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases (DBs, Google Sheets, CSV, ...)
for d in self.databases: d.done(result)

Wyświetl plik

@@ -2,10 +2,8 @@ from typing import Union, Tuple
import datetime
from urllib.parse import quote
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from . import Database
from ..core import Metadata
from ..core import Media
@@ -61,13 +59,13 @@ class GsheetsDb(Database):
cell_updates.append((row, 'status', item.status))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):

Wyświetl plik

@@ -2,4 +2,5 @@
from .gworksheet import GWorksheet
from .misc import *
from .webdriver import Webdriver
from .gsheet import Gsheets
from .url import UrlUtil

Wyświetl plik

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "3"
_MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "0"
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""