Mirror of https://github.com/bellingcat/auto-archiver
Merge pull request #55 from djhmateer/dev-upstream
commit 12918d4fce
@@ -16,4 +16,5 @@ config.yaml
 config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
+secrets/*
@@ -26,8 +26,8 @@ class ArchiveResult:
     screenshot: str = None
     hash: str = None


 class Archiver(ABC):
+    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"

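The new HASH_ALGORITHM class attribute acts as a process-wide default that every archiver subclass inherits, so a single override from the user config changes the behaviour everywhere. A minimal sketch of that pattern (the subclass name here is hypothetical, purely for illustration):

    from abc import ABC

    class Archiver(ABC):
        HASH_ALGORITHM = "SHA-256"  # class-level default shared by every subclass

    class ExampleArchiver(Archiver):  # hypothetical subclass for illustration
        def describe(self):
            return f"hashing with {self.HASH_ALGORITHM}"

    # overriding the attribute on the base class once, before archiving starts,
    # is picked up by every subclass that reads self.HASH_ALGORITHM
    Archiver.HASH_ALGORITHM = "SHA3-512"
    print(ExampleArchiver().describe())  # -> hashing with SHA3-512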
@@ -47,7 +47,6 @@ class Archiver(ABC):
     def get_netloc(self, url):
         return urlparse(url).netloc

-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
@@ -163,10 +162,12 @@
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read() # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
         return hash.hexdigest()

     def get_screenshot(self, url):
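The rewritten get_hash stays within the standard library; hashlib provides both algorithms. A standalone sketch of the same selection logic (the file name in the commented call is hypothetical), noting that SHA-256 yields a 64-character hex digest and SHA3-512 a 128-character one:

    import hashlib

    def file_hash(filename: str, algorithm: str = "SHA-256") -> str:
        with open(filename, "rb") as f:
            data = f.read()  # read the entire file as bytes
        if algorithm == "SHA-256":
            digest = hashlib.sha256(data)
        elif algorithm == "SHA3-512":
            digest = hashlib.sha3_512(data)
        else:
            raise Exception(f"Unknown Hash Algorithm of {algorithm}")
        return digest.hexdigest()

    # print(file_hash("archive.mp4", "SHA3-512"))  # hypothetical file name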
@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):

         for u in urls:
             if u is None:
-                logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                 return self.download_alternative(url, tweet_id)
         logger.debug(f"found {urls=}")

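The log-level change above treats a missing media URL as a known condition rather than an error: the API-based archiver simply hands the tweet over to the scraping-based TwitterArchiver. A rough sketch of that control flow with simplified, hypothetical signatures (the real method operates on a Twitter API response object):

    from loguru import logger

    def collect_media(urls, url, tweet_id, download_alternative):
        # hypothetical, simplified stand-in for the API archiver's media loop
        for u in urls:
            if u is None:
                # expected occasionally, so log at debug level and fall back to
                # the scraping-based path instead of treating it as an error
                logger.debug(f"None url for {tweet_id=}, using download_alternative")
                return download_alternative(url, tweet_id)
        logger.debug(f"found {urls=}")
        return urls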
@@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo

 from .base_archiver import Archiver, ArchiveResult


 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods, and it works as
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """

     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
@@ -1,5 +1,6 @@

 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -81,6 +82,8 @@ class Config:
         )
         self.webdriver = "not initialized"

+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)

         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})

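This is where the config file reaches the class attribute: Config reads the optional hash_algorithm key from the execution section and, via dict.get, keeps the existing SHA-256 default when the key is absent. A minimal sketch of that wiring (the inline YAML stands in for a user's config file and is only illustrative):

    import yaml

    class Archiver:  # stand-in for archivers.base_archiver.Archiver
        HASH_ALGORITHM = "SHA-256"

    config = yaml.safe_load("""
    execution:
      hash_algorithm: SHA3-512
    """)

    execution = config.get("execution", {})
    # dict.get preserves the SHA-256 default whenever the key is missing
    Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
    print(Archiver.HASH_ALGORITHM)  # -> SHA3-512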
@@ -259,6 +262,7 @@ class Config:
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,
@@ -80,6 +80,10 @@ execution:
   storage: s3
   # defaults to false, when true will try to avoid duplicate URL archives
   check_if_exists: true

+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256

   # optional configurations for the selenium browser that takes screenshots, these are the defaults
   selenium:
     # values under 10s might mean screenshots fail to grab screenshot
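Since get_hash compares the configured value against literal strings, the YAML value must match exactly ("SHA-256" or "SHA3-512"); anything else raises at hash time. A small sketch of the accepted and rejected values (the byte string is illustrative):

    import hashlib

    def digest_length(algorithm: str, data: bytes = b"example") -> int:
        if algorithm == "SHA-256":
            return len(hashlib.sha256(data).hexdigest())
        elif algorithm == "SHA3-512":
            return len(hashlib.sha3_512(data).hexdigest())
        raise Exception(f"Unknown Hash Algorithm of {algorithm}")

    print(digest_length("SHA-256"))   # 64 hex characters
    print(digest_length("SHA3-512"))  # 128 hex characters
    # digest_length("sha256") would raise: values are matched case-sensitively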
@@ -104,3 +108,4 @@ execution:
   duration: duration
   screenshot: screenshot
   hash: hash