Merge pull request #55 from djhmateer/dev-upstream

pull/56/head
Miguel Sozinho Ramalho 2022-07-25 12:38:33 +01:00 zatwierdzone przez GitHub
commit 12918d4fce
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
6 zmienionych plików z 20 dodań i 9 usunięć

3
.gitignore vendored
Wyświetl plik

@@ -16,4 +16,5 @@ config.yaml
config-*.yaml config-*.yaml
logs/* logs/*
local_archive/ local_archive/
vk_config*.json vk_config*.json
secrets/*

Wyświetl plik

@@ -26,8 +26,8 @@ class ArchiveResult:
screenshot: str = None screenshot: str = None
hash: str = None hash: str = None
class Archiver(ABC): class Archiver(ABC):
HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
name = "default" name = "default"
retry_regex = r"retrying at (\d+)$" retry_regex = r"retrying at (\d+)$"
@@ -47,7 +47,6 @@ class Archiver(ABC):
def get_netloc(self, url): def get_netloc(self, url):
return urlparse(url).netloc return urlparse(url).netloc
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
""" """
Generates an index.html page where each @urls_info is displayed Generates an index.html page where each @urls_info is displayed
@@ -163,10 +162,12 @@ class Archiver(ABC):
def get_hash(self, filename): def get_hash(self, filename):
with open(filename, "rb") as f: with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes bytes = f.read() # read entire file as bytes
# TODO: customizable hash logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
# hash = hashlib.sha3_512(bytes) elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
return hash.hexdigest() return hash.hexdigest()
def get_screenshot(self, url): def get_screenshot(self, url):

Wyświetl plik

@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):
for u in urls: for u in urls:
if u is None: if u is None:
logger.error(f"Should not have gotten None url for {tweet.includes.media=}") logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
return self.download_alternative(url, tweet_id) return self.download_alternative(url, tweet_id)
logger.debug(f"found {urls=}") logger.debug(f"found {urls=}")

Wyświetl plik

@@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
""" """
This Twitter Archiver uses unofficial scraping methods, and it works as This Twitter Archiver uses unofficial scraping methods, and it works as
an alternative to TwitterApiArchiver when no API credentials are provided. an alternative to TwitterApiArchiver when no API credentials are provided.
""" """
name = "twitter" name = "twitter"
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

Wyświetl plik

@@ -1,5 +1,6 @@
import argparse, yaml, json import argparse, yaml, json
from archivers.base_archiver import Archiver
import gspread import gspread
from loguru import logger from loguru import logger
from selenium import webdriver from selenium import webdriver
@@ -81,6 +82,8 @@ class Config:
) )
self.webdriver = "not initialized" self.webdriver = "not initialized"
Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
# ---------------------- SECRETS - APIs and service configurations # ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {}) secrets = self.config.get("secrets", {})
@@ -259,6 +262,7 @@ class Config:
"storage": self.storage, "storage": self.storage,
"header": self.header, "header": self.header,
"check_if_exists": self.check_if_exists, "check_if_exists": self.check_if_exists,
"hash_algorithm": Archiver.HASH_ALGORITHM,
"save_logs": self.save_logs, "save_logs": self.save_logs,
"selenium_config": asdict(self.selenium_config), "selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None, "selenium_webdriver": self.webdriver != None,

Wyświetl plik

@@ -80,6 +80,10 @@ execution:
storage: s3 storage: s3
# defaults to false, when true will try to avoid duplicate URL archives # defaults to false, when true will try to avoid duplicate URL archives
check_if_exists: true check_if_exists: true
# choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
# hash_algorithm: SHA-256
# optional configurations for the selenium browser that takes screenshots, these are the defaults # optional configurations for the selenium browser that takes screenshots, these are the defaults
selenium: selenium:
# values under 10s might mean screenshots fail to grab screenshot # values under 10s might mean screenshots fail to grab screenshot
@@ -104,3 +108,4 @@ execution:
duration: duration duration: duration
screenshot: screenshot screenshot: screenshot
hash: hash hash: hash