kopia lustrzana https://github.com/bellingcat/auto-archiver
Added hash_algorithm to config to choose between SHA256 and SHA3_512
rodzic
6d8be4c07f
commit
363a8ef67a
|
@ -16,4 +16,6 @@ config.yaml
|
||||||
config-*.yaml
|
config-*.yaml
|
||||||
logs/*
|
logs/*
|
||||||
local_archive/
|
local_archive/
|
||||||
vk_config*.json
|
vk_config*.json
|
||||||
|
|
||||||
|
secrets/*
|
|
@ -31,9 +31,10 @@ class Archiver(ABC):
|
||||||
name = "default"
|
name = "default"
|
||||||
retry_regex = r"retrying at (\d+)$"
|
retry_regex = r"retrying at (\d+)$"
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver):
|
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||||
self.storage = storage
|
self.storage = storage
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
|
self.hash_algorithm = hash_algorithm
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__class__.__name__
|
return self.__class__.__name__
|
||||||
|
@ -163,10 +164,13 @@ class Archiver(ABC):
|
||||||
def get_hash(self, filename):
|
def get_hash(self, filename):
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
bytes = f.read() # read entire file as bytes
|
bytes = f.read() # read entire file as bytes
|
||||||
# TODO: customizable hash
|
ha = self.hash_algorithm
|
||||||
hash = hashlib.sha256(bytes)
|
logger.debug(f'Hash algorithm is {ha}')
|
||||||
# option to use SHA3_512 instead
|
|
||||||
# hash = hashlib.sha3_512(bytes)
|
if ha == "SHA3_512": hash = hashlib.sha3_512(bytes)
|
||||||
|
elif ha == "SHA256": hash = hashlib.sha256(bytes)
|
||||||
|
else: raise Exception("Unknown Hash Algorithm of {ha}")
|
||||||
|
|
||||||
return hash.hexdigest()
|
return hash.hexdigest()
|
||||||
|
|
||||||
def get_screenshot(self, url):
|
def get_screenshot(self, url):
|
||||||
|
|
|
@ -11,6 +11,9 @@ from storages import Storage
|
||||||
class TelegramArchiver(Archiver):
|
class TelegramArchiver(Archiver):
|
||||||
name = "telegram"
|
name = "telegram"
|
||||||
|
|
||||||
|
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||||
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
|
|
||||||
def download(self, url, check_if_exists=False):
|
def download(self, url, check_if_exists=False):
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
if 't.me' != self.get_netloc(url):
|
if 't.me' != self.get_netloc(url):
|
||||||
|
|
|
@ -15,8 +15,8 @@ class TelethonArchiver(Archiver):
|
||||||
name = "telethon"
|
name = "telethon"
|
||||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, config: TelethonConfig):
|
def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm):
|
||||||
super().__init__(storage, driver)
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
if config:
|
if config:
|
||||||
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
|
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
|
||||||
self.bot_token = config.bot_token
|
self.bot_token = config.bot_token
|
||||||
|
|
|
@ -15,6 +15,9 @@ class TiktokArchiver(Archiver):
|
||||||
|
|
||||||
status = 'success'
|
status = 'success'
|
||||||
|
|
||||||
|
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||||
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
info = tiktok_downloader.info_post(url)
|
info = tiktok_downloader.info_post(url)
|
||||||
key = self.get_key(f'{info.id}.mp4')
|
key = self.get_key(f'{info.id}.mp4')
|
||||||
|
|
|
@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver
|
||||||
class TwitterApiArchiver(TwitterArchiver):
|
class TwitterApiArchiver(TwitterArchiver):
|
||||||
name = "twitter_api"
|
name = "twitter_api"
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
|
def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm):
|
||||||
super().__init__(storage, driver)
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
|
|
||||||
if config.bearer_token:
|
if config.bearer_token:
|
||||||
self.api = Api(bearer_token=config.bearer_token)
|
self.api = Api(bearer_token=config.bearer_token)
|
||||||
|
@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):
|
||||||
|
|
||||||
for u in urls:
|
for u in urls:
|
||||||
if u is None:
|
if u is None:
|
||||||
logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
|
logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
|
||||||
return self.download_alternative(url, tweet_id)
|
return self.download_alternative(url, tweet_id)
|
||||||
logger.debug(f"found {urls=}")
|
logger.debug(f"found {urls=}")
|
||||||
|
|
||||||
|
|
|
@ -5,12 +5,16 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||||
|
|
||||||
from .base_archiver import Archiver, ArchiveResult
|
from .base_archiver import Archiver, ArchiveResult
|
||||||
|
|
||||||
|
from storages import Storage
|
||||||
|
|
||||||
class TwitterArchiver(Archiver):
|
class TwitterArchiver(Archiver):
|
||||||
"""
|
"""
|
||||||
This Twitter Archiver uses unofficial scraping methods, and it works as
|
This Twitter Archiver uses unofficial scraping methods, and it works as
|
||||||
an alternative to TwitterApiArchiver when no API credentials are provided.
|
an alternative to TwitterApiArchiver when no API credentials are provided.
|
||||||
"""
|
"""
|
||||||
|
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||||
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
|
|
||||||
name = "twitter"
|
name = "twitter"
|
||||||
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||||
|
|
||||||
|
|
|
@ -17,8 +17,8 @@ class VkArchiver(Archiver):
|
||||||
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
|
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||||
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
|
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, config: VkConfig):
|
def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm):
|
||||||
super().__init__(storage, driver)
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
if config != None:
|
if config != None:
|
||||||
self.vks = VkScraper(config.username, config.password)
|
self.vks = VkScraper(config.username, config.password)
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,8 @@ class WaybackArchiver(Archiver):
|
||||||
"""
|
"""
|
||||||
name = "wayback"
|
name = "wayback"
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm):
|
||||||
super(WaybackArchiver, self).__init__(storage, driver)
|
super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.seen_urls = {}
|
self.seen_urls = {}
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver):
|
||||||
name = "youtube_dl"
|
name = "youtube_dl"
|
||||||
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
||||||
|
|
||||||
def __init__(self, storage: Storage, driver, fb_cookie):
|
def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm):
|
||||||
super().__init__(storage, driver)
|
super().__init__(storage, driver, hash_algorithm)
|
||||||
self.fb_cookie = fb_cookie
|
self.fb_cookie = fb_cookie
|
||||||
|
|
||||||
def download(self, url, check_if_exists=False):
|
def download(self, url, check_if_exists=False):
|
||||||
|
|
|
@ -104,14 +104,14 @@ def process_sheet(c: Config):
|
||||||
|
|
||||||
# order matters, first to succeed excludes remaining
|
# order matters, first to succeed excludes remaining
|
||||||
active_archivers = [
|
active_archivers = [
|
||||||
TelethonArchiver(storage, c.webdriver, c.telegram_config),
|
TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm),
|
||||||
TiktokArchiver(storage, c.webdriver),
|
TiktokArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||||
TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
|
TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm),
|
||||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm),
|
||||||
TelegramArchiver(storage, c.webdriver),
|
TelegramArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||||
TwitterArchiver(storage, c.webdriver),
|
TwitterArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||||
VkArchiver(storage, c.webdriver, c.vk_config),
|
VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm),
|
||||||
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm)
|
||||||
]
|
]
|
||||||
|
|
||||||
for archiver in active_archivers:
|
for archiver in active_archivers:
|
||||||
|
|
|
@ -81,6 +81,8 @@ class Config:
|
||||||
)
|
)
|
||||||
self.webdriver = "not initialized"
|
self.webdriver = "not initialized"
|
||||||
|
|
||||||
|
self.hash_algorithm = execution.get("hash_algorithm")
|
||||||
|
|
||||||
# ---------------------- SECRETS - APIs and service configurations
|
# ---------------------- SECRETS - APIs and service configurations
|
||||||
secrets = self.config.get("secrets", {})
|
secrets = self.config.get("secrets", {})
|
||||||
|
|
||||||
|
|
|
@ -104,3 +104,8 @@ execution:
|
||||||
duration: duration
|
duration: duration
|
||||||
screenshot: screenshot
|
screenshot: screenshot
|
||||||
hash: hash
|
hash: hash
|
||||||
|
|
||||||
|
# Must be either SHA256 or SHA3_512
|
||||||
|
hash_algorithm: SHA3_512
|
||||||
|
# hash_algorithm: SHA256
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue