instagram archiver via telegram bot

pull/72/head
msramalho 2023-02-17 15:46:29 +00:00
rodzic f35875a94c
commit aa5430451e
8 zmienionych plików z 18 dodań i 12 usunięć

Wyświetl plik

@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver

Wyświetl plik

@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
with self.client.start():
# with self.client.start(bot_token=self.bot_token):
try:
post = self.client.get_messages(chat, ids=post_id)
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False

Wyświetl plik

@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
return self.link_clean_pattern.sub("\\1", url)
# Returns False unconditionally: the `url` argument is not inspected.
def is_rearchivable(self, url: str) -> bool:
# Twitter posts are static
# Twitter posts are static (for now)
return False
def download(self, item: Metadata) -> Metadata:
@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter")
return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
"""

Wyświetl plik

@ -63,6 +63,9 @@ class Metadata:
# True when the substring "success" occurs anywhere in self.status.
def is_success(self) -> bool:
return "success" in self.status
# True only when all three conditions hold: is_success() is False, self.media
# has no entries, and get_clean_metadata() has at most two entries (the inline
# note names those two as url and processed_at).
def is_empty(self) -> bool:
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
# Read-only property: the `netloc` component of the value returned by
# self.get_url(), as parsed by urlparse (presumably urllib.parse.urlparse --
# the import is outside this view; confirm).
@property # getter .netloc
def netloc(self) -> str:
return urlparse(self.get_url()).netloc
@ -122,7 +125,7 @@ class Metadata:
for m in self.media:
if m.get("id") == id: return m
return default
def get_first_image(self, default=None) -> Media:
for m in self.media:
if "image" in m.mimetype: return m

Wyświetl plik

@ -123,6 +123,9 @@ class ArchivingOrchestrator:
s.store(final_media, result)
result.set_final_media(final_media)
if result.is_empty():
result.status = "nothing archived"
# signal completion to databases (DBs, Google Sheets, CSV, ...)
for d in self.databases: d.done(result)

Wyświetl plik

@ -2,10 +2,8 @@ from typing import Union, Tuple
import datetime
from urllib.parse import quote
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from . import Database
from ..core import Metadata
from ..core import Media
@ -61,13 +59,13 @@ class GsheetsDb(Database):
cell_updates.append((row, 'status', item.status))
media: Media = item.get_final_media()
batch_if_valid('archive', "\n".join(media.urls))
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):

Wyświetl plik

@ -2,4 +2,5 @@
from .gworksheet import GWorksheet
from .misc import *
from .webdriver import Webdriver
from .gsheet import Gsheets
from .gsheet import Gsheets
from .url import UrlUtil

Wyświetl plik

@ -1,9 +1,9 @@
# Version components, each stored as a string. Where a name is assigned twice
# below, the later assignment is the one that takes effect.
_MAJOR = "0"
_MINOR = "3"
_MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "0"
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""