mirror of https://github.com/bellingcat/auto-archiver

instagram archiver via telegram bot

parent f35875a94c
commit aa5430451e
@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
 from .twitter_archiver import TwitterArchiver
 from .twitter_api_archiver import TwitterApiArchiver
 from .instagram_archiver import InstagramArchiver
+from .instagram_tbot_archiver import InstagramTbotArchiver
 from .tiktok_archiver import TiktokArchiver
 from .telegram_archiver import TelegramArchiver
 from .vk_archiver import VkArchiver
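Note: the only change here is registering the new InstagramTbotArchiver alongside the existing archivers; its implementation is not part of this diff. The sketch below is a guess at the shape it must have to be registered like its siblings, with every name and signature an assumption inferred from the other archivers in this commit.

# Hypothetical skeleton only; the real code lives in instagram_tbot_archiver.py.
from . import Archiver           # assumed base class, cf. TelethonArchiver(Archiver) below
from ..core import Metadata

class InstagramTbotArchiver(Archiver):
    name = "instagram_tbot_archiver"   # assumed naming convention

    def download(self, item: Metadata) -> Metadata:
        # Presumably forwards the Instagram URL to a Telegram bot (per the
        # commit title) and archives whatever media the bot replies with.
        ...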
@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
         with self.client.start():
         # with self.client.start(bot_token=self.bot_token):
             try:
                 post = self.client.get_messages(chat, ids=post_id)
             except ValueError as e:
                 logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
                 return False
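Note on the except clause above: Telethon raises ValueError when it cannot resolve a chat entity, which is the usual symptom of a private or deleted channel, so catching it and returning False is a sensible failure mode. A standalone reproduction sketch, with placeholder credentials:

from telethon.sync import TelegramClient

client = TelegramClient("anon", api_id=12345, api_hash="0123456789abcdef")  # placeholders
with client.start():
    try:
        post = client.get_messages("some_private_channel", ids=42)
    except ValueError as e:
        # e.g. "Could not find the input entity for ..." on unresolvable chats
        print(f"could not fetch: {e}")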
@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
         return self.link_clean_pattern.sub("\\1", url)

     def is_rearchivable(self, url: str) -> bool:
-        # Twitter posts are static
+        # Twitter posts are static (for now)
         return False

     def download(self, item: Metadata) -> Metadata:
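Note: is_rearchivable presumably lets the orchestrator decide whether an already-archived URL is worth fetching again; returning False declares tweets immutable, and the added "(for now)" hedges that claim. An illustration of the assumed call site (already_archived and cached_result are hypothetical helpers, not from this codebase):

# Hypothetical orchestrator-side use of the flag:
if already_archived(url) and not archiver.is_rearchivable(url):
    return cached_result(url)   # static content: skip the re-download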
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
             media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
             result.add_media(media)

-        return result.success("twitter")
+        return result.success("twitter-snscrape")

     def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
         """
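Note: relabelling result.success("twitter") to "twitter-snscrape" distinguishes this scraper path from the API-based download_alternative that follows. The label presumably ends up inside Metadata.status, which would be consistent with the substring test is_success() performs in the Metadata hunk below; a sketch of that assumed relationship:

# Assumed shape of Metadata.success, consistent with is_success() below
# checking `"success" in self.status`:
def success(self, context: str) -> "Metadata":
    self.status = f"{context}: success"   # e.g. "twitter-snscrape: success"
    return self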
@@ -63,6 +63,9 @@ class Metadata:
     def is_success(self) -> bool:
         return "success" in self.status

+    def is_empty(self) -> bool:
+        return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
+
     @property  # getter .netloc
     def netloc(self) -> str:
         return urlparse(self.get_url()).netloc
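Note: the new is_empty() treats a result as empty only when all three signals agree: no success status, no media, and no metadata beyond the two bookkeeping keys called out in the trailing comment. A small illustration (the no-argument constructor and set_url setter are assumptions):

m = Metadata()
m.set_url("https://example.com/post")   # assumed setter; fills the "url" key
# nothing archived: no media, no success status, only url/processed_at present
assert m.is_empty()

m.success("twitter-snscrape")
assert not m.is_empty()                 # a success status alone disqualifies "empty"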
@@ -122,7 +125,7 @@ class Metadata:
         for m in self.media:
             if m.get("id") == id: return m
         return default

     def get_first_image(self, default=None) -> Media:
         for m in self.media:
             if "image" in m.mimetype: return m
@@ -123,6 +123,9 @@ class ArchivingOrchestrator:
                 s.store(final_media, result)
             result.set_final_media(final_media)

+        if result.is_empty():
+            result.status = "nothing archived"
+
         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         for d in self.databases: d.done(result)
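Note: with this guard, an item that every archiver failed on reaches the databases with an explicit "nothing archived" status instead of whatever default was left in place. The surrounding flow, paraphrased rather than quoted from source:

# Paraphrased control flow, not literal source:
result = self.archive(url)               # archivers, enrichers, storages run first
if result.is_empty():
    result.status = "nothing archived"   # explicit terminal status for total failure
for d in self.databases:
    d.done(result)                       # e.g. GsheetsDb writes the status cell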
@@ -2,10 +2,8 @@ from typing import Union, Tuple
 import datetime
 from urllib.parse import quote

-# from metadata import Metadata
 from loguru import logger

-# from . import Enricher
 from . import Database
 from ..core import Metadata
 from ..core import Media
@@ -61,13 +59,13 @@ class GsheetsDb(Database):
         cell_updates.append((row, 'status', item.status))

         media: Media = item.get_final_media()
-        batch_if_valid('archive', "\n".join(media.urls))
+        if hasattr(media, "urls"):
+            batch_if_valid('archive', "\n".join(media.urls))
         batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
         batch_if_valid('title', item.get_title())
         batch_if_valid('text', item.get("content", "")[:500])
         batch_if_valid('timestamp', item.get_timestamp())
-        if (screenshot := item.get_media_by_id("screenshot")):
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
             batch_if_valid('screenshot', "\n".join(screenshot.urls))

         if (thumbnail := item.get_first_image("thumbnail")):
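Note: both hasattr guards protect against Media objects that no storage ever uploaded; media.urls presumably only exists after a store step succeeds, so "\n".join(media.urls) would otherwise raise AttributeError. An equivalent defensive spelling of the same idea:

media = item.get_final_media()
urls = getattr(media, "urls", [])        # [] when no storage produced URLs
if urls:
    batch_if_valid('archive', "\n".join(urls))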
@@ -2,4 +2,5 @@
 from .gworksheet import GWorksheet
 from .misc import *
 from .webdriver import Webdriver
 from .gsheet import Gsheets
+from .url import UrlUtil
@@ -1,9 +1,9 @@
 _MAJOR = "0"
-_MINOR = "3"
+_MINOR = "4"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "0"
+_PATCH = "1"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
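Note: version files laid out like this one conventionally assemble the public version string from these four parts; a sketch of the likely remainder of the file (assumed, not shown in this diff):

# Likely tail of version.py: yields "0.4.1", plus ".devYYYYMMDD" on nightlies.
VERSION_SHORT = f"{_MAJOR}.{_MINOR}"
VERSION = f"{_MAJOR}.{_MINOR}.{_PATCH}{_SUFFIX}"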