Merge pull request #55 from djhmateer/dev-upstream

2022-07-25 12:38:33 +01:00 · 2022-07-25 12:38:33 +01:00 · 12918d4fce
commit 12918d4fce
--- a/.gitignore
+++ b/.gitignore
@ -17,3 +17,4 @@ config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
+secrets/*
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@ -26,8 +26,8 @@ class ArchiveResult:
    screenshot: str = None
    hash: str = None

-
 class Archiver(ABC):
+    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
    name = "default"
    retry_regex = r"retrying at (\d+)$"

@ -47,7 +47,6 @@ class Archiver(ABC):
    def get_netloc(self, url):
        return urlparse(url).netloc

-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
        """
        Generates an index.html page where each @urls_info is displayed
@ -163,10 +162,12 @@ class Archiver(ABC):
    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+
        return hash.hexdigest()

    def get_screenshot(self, url):
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):

            for u in urls:
                if u is None:
-                    logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                    logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                    return self.download_alternative(url, tweet_id)
        logger.debug(f"found {urls=}")

--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo

 from .base_archiver import Archiver, ArchiveResult

-
 class TwitterArchiver(Archiver):
    """
    This Twitter Archiver uses unofficial scraping methods, and it works as 
    an alternative to TwitterApiArchiver when no API credentials are provided.
    """
+
    name = "twitter"
    link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

--- a/configs/config.py
+++ b/configs/config.py
@ -1,5 +1,6 @@

 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@ -81,6 +82,8 @@ class Config:
        )
        self.webdriver = "not initialized"

+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+
        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets", {})

@ -259,6 +262,7 @@ class Config:
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            "selenium_webdriver": self.webdriver != None,
--- a/example.config.yaml
+++ b/example.config.yaml
@ -80,6 +80,10 @@ execution:
  storage: s3
  # defaults to false, when true will try to avoid duplicate URL archives
  check_if_exists: true
+
+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256
+
  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # values under 10s might cause screenshots to fail
@ -104,3 +108,4 @@ execution:
    duration: duration
    screenshot: screenshot
    hash: hash
+