Added hash_algorithm to config to choose between SHA256 and SHA3_512

pull/55/head
Dave Mateer 2022-07-18 13:15:48 +01:00
rodzic 6d8be4c07f
commit 363a8ef67a
13 zmienionych plików z 48 dodań i 25 usunięć

2
.gitignore vendored
Wyświetl plik

@@ -17,3 +17,5 @@ config-*.yaml
logs/* logs/*
local_archive/ local_archive/
vk_config*.json vk_config*.json
secrets/*

Wyświetl plik

@@ -31,9 +31,10 @@ class Archiver(ABC):
name = "default" name = "default"
retry_regex = r"retrying at (\d+)$" retry_regex = r"retrying at (\d+)$"
def __init__(self, storage: Storage, driver): def __init__(self, storage: Storage, driver, hash_algorithm):
self.storage = storage self.storage = storage
self.driver = driver self.driver = driver
self.hash_algorithm = hash_algorithm
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
@@ -163,10 +164,13 @@ class Archiver(ABC):
def get_hash(self, filename): def get_hash(self, filename):
with open(filename, "rb") as f: with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes bytes = f.read() # read entire file as bytes
# TODO: customizable hash ha = self.hash_algorithm
hash = hashlib.sha256(bytes) logger.debug(f'Hash algorithm is {ha}')
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes) if ha == "SHA3_512": hash = hashlib.sha3_512(bytes)
elif ha == "SHA256": hash = hashlib.sha256(bytes)
else: raise Exception("Unknown Hash Algorithm of {ha}")
return hash.hexdigest() return hash.hexdigest()
def get_screenshot(self, url): def get_screenshot(self, url):

Wyświetl plik

@@ -11,6 +11,9 @@ from storages import Storage
class TelegramArchiver(Archiver): class TelegramArchiver(Archiver):
name = "telegram" name = "telegram"
def __init__(self, storage: Storage, driver, hash_algorithm):
super().__init__(storage, driver, hash_algorithm)
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
if 't.me' != self.get_netloc(url): if 't.me' != self.get_netloc(url):

Wyświetl plik

@@ -15,8 +15,8 @@ class TelethonArchiver(Archiver):
name = "telethon" name = "telethon"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
def __init__(self, storage: Storage, driver, config: TelethonConfig): def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm):
super().__init__(storage, driver) super().__init__(storage, driver, hash_algorithm)
if config: if config:
self.client = TelegramClient("./anon", config.api_id, config.api_hash) self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token self.bot_token = config.bot_token

Wyświetl plik

@@ -15,6 +15,9 @@ class TiktokArchiver(Archiver):
status = 'success' status = 'success'
def __init__(self, storage: Storage, driver, hash_algorithm):
super().__init__(storage, driver, hash_algorithm)
try: try:
info = tiktok_downloader.info_post(url) info = tiktok_downloader.info_post(url)
key = self.get_key(f'{info.id}.mp4') key = self.get_key(f'{info.id}.mp4')

Wyświetl plik

@@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver
class TwitterApiArchiver(TwitterArchiver): class TwitterApiArchiver(TwitterArchiver):
name = "twitter_api" name = "twitter_api"
def __init__(self, storage: Storage, driver, config: TwitterApiConfig): def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm):
super().__init__(storage, driver) super().__init__(storage, driver, hash_algorithm)
if config.bearer_token: if config.bearer_token:
self.api = Api(bearer_token=config.bearer_token) self.api = Api(bearer_token=config.bearer_token)
@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):
for u in urls: for u in urls:
if u is None: if u is None:
logger.error(f"Should not have gotten None url for {tweet.includes.media=}") logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
return self.download_alternative(url, tweet_id) return self.download_alternative(url, tweet_id)
logger.debug(f"found {urls=}") logger.debug(f"found {urls=}")

Wyświetl plik

@@ -5,12 +5,16 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
""" """
This Twitter Archiver uses unofficial scraping methods, and it works as This Twitter Archiver uses unofficial scraping methods, and it works as
an alternative to TwitterApiArchiver when no API credentials are provided. an alternative to TwitterApiArchiver when no API credentials are provided.
""" """
def __init__(self, storage: Storage, driver, hash_algorithm):
super().__init__(storage, driver, hash_algorithm)
name = "twitter" name = "twitter"
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

Wyświetl plik

@@ -17,8 +17,8 @@ class VkArchiver(Archiver):
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
def __init__(self, storage: Storage, driver, config: VkConfig): def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm):
super().__init__(storage, driver) super().__init__(storage, driver, hash_algorithm)
if config != None: if config != None:
self.vks = VkScraper(config.username, config.password) self.vks = VkScraper(config.username, config.password)

Wyświetl plik

@@ -15,8 +15,8 @@ class WaybackArchiver(Archiver):
""" """
name = "wayback" name = "wayback"
def __init__(self, storage: Storage, driver, config: WaybackConfig): def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm):
super(WaybackArchiver, self).__init__(storage, driver) super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm)
self.config = config self.config = config
self.seen_urls = {} self.seen_urls = {}

Wyświetl plik

@@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver):
name = "youtube_dl" name = "youtube_dl"
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
def __init__(self, storage: Storage, driver, fb_cookie): def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm):
super().__init__(storage, driver) super().__init__(storage, driver, hash_algorithm)
self.fb_cookie = fb_cookie self.fb_cookie = fb_cookie
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):

Wyświetl plik

@@ -104,14 +104,14 @@ def process_sheet(c: Config):
# order matters, first to succeed excludes remaining # order matters, first to succeed excludes remaining
active_archivers = [ active_archivers = [
TelethonArchiver(storage, c.webdriver, c.telegram_config), TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm),
TiktokArchiver(storage, c.webdriver), TiktokArchiver(storage, c.webdriver, c.hash_algorithm),
TwitterApiArchiver(storage, c.webdriver, c.twitter_config), TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm),
TelegramArchiver(storage, c.webdriver), TelegramArchiver(storage, c.webdriver, c.hash_algorithm),
TwitterArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver, c.hash_algorithm),
VkArchiver(storage, c.webdriver, c.vk_config), VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm),
WaybackArchiver(storage, c.webdriver, c.wayback_config) WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm)
] ]
for archiver in active_archivers: for archiver in active_archivers:

Wyświetl plik

@@ -81,6 +81,8 @@ class Config:
) )
self.webdriver = "not initialized" self.webdriver = "not initialized"
self.hash_algorithm = execution.get("hash_algorithm")
# ---------------------- SECRETS - APIs and service configurations # ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {}) secrets = self.config.get("secrets", {})

Wyświetl plik

@@ -104,3 +104,8 @@ execution:
duration: duration duration: duration
screenshot: screenshot screenshot: screenshot
hash: hash hash: hash
# Must be either SHA256 or SHA3_512
hash_algorithm: SHA3_512
# hash_algorithm: SHA256