diff --git a/.gitignore b/.gitignore index 2885782..2059faa 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ config.yaml config-*.yaml logs/* local_archive/ -vk_config*.json \ No newline at end of file +vk_config*.json +secrets/* \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 815d31e..902f626 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -26,8 +26,8 @@ class ArchiveResult: screenshot: str = None hash: str = None - class Archiver(ABC): + HASH_ALGORITHM="SHA-256" # can be overwritten by user configs name = "default" retry_regex = r"retrying at (\d+)$" @@ -47,7 +47,6 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): """ Generates an index.html page where each @urls_info is displayed @@ -163,10 +162,12 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - # TODO: customizable hash - hash = hashlib.sha256(bytes) - # option to use SHA3_512 instead - # hash = hashlib.sha3_512(bytes) + logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}') + + if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes) + elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes) + else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}") + return hash.hexdigest() def get_screenshot(self, url): diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index ef2bf40..6aa1742 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver): for u in urls: if u is None: - logger.error(f"Should not have gotten None url for {tweet.includes.media=}") + logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver") return self.download_alternative(url, tweet_id) logger.debug(f"found {urls=}") diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 1c1b173..8f646fd 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from .base_archiver import Archiver, ArchiveResult - class TwitterArchiver(Archiver): """ This Twitter Archiver uses unofficial scraping methods, and it works as an alternative to TwitterApiArchiver when no API credentials are provided. """ + name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") diff --git a/configs/config.py b/configs/config.py index 41b531a..063c4d7 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,6 @@ import argparse, yaml, json +from archivers.base_archiver import Archiver import gspread from loguru import logger from selenium import webdriver @@ -81,6 +82,8 @@ class Config: ) self.webdriver = "not initialized" + Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM) + # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -259,6 +262,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, + "hash_algorithm": Archiver.HASH_ALGORITHM, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index 3092efc..2cded09 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -80,6 +80,10 @@ execution: storage: s3 # defaults to false, when true will try to avoid duplicate URL archives check_if_exists: true + + # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256) + # hash_algorithm: SHA-256 + # optional configurations for the selenium browser that takes screenshots, these are the defaults selenium: # values under 10s might mean screenshots fail to grab screenshot @@ -104,3 +108,4 @@ execution: duration: duration screenshot: screenshot hash: hash +