Merge pull request #58 from bellingcat/dev

pull/62/head
Miguel Sozinho Ramalho 2022-09-21 18:53:13 +02:00 committed by GitHub
commit 0bd9e043ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 202 additions and 22 deletions

.gitignore
View file

@@ -16,4 +16,7 @@ config.yaml
 config-*.yaml
 logs/*
 local_archive/
 vk_config*.json
+gd-token.json
+credentials.json
+secrets/*

View file

@@ -26,8 +26,8 @@ class ArchiveResult:
     screenshot: str = None
     hash: str = None

 class Archiver(ABC):
+    HASH_ALGORITHM = "SHA-256"  # can be overwritten by user configs
     name = "default"
     retry_regex = r"retrying at (\d+)$"
@@ -47,7 +47,6 @@ class Archiver(ABC):
     def get_netloc(self, url):
         return urlparse(url).netloc

-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
     def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
         """
         Generates an index.html page where each @urls_info is displayed
@@ -163,10 +162,12 @@ class Archiver(ABC):
     def get_hash(self, filename):
         with open(filename, "rb") as f:
             bytes = f.read()  # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
         return hash.hexdigest()

     def get_screenshot(self, url):
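
A minimal standalone sketch of the hash selection introduced above; compute_hash is a hypothetical helper for illustration, not part of this PR:

import hashlib

def compute_hash(data: bytes, algorithm: str = "SHA-256") -> str:
    # Mirror the two algorithms the diff accepts; anything else is an error
    if algorithm == "SHA-256":
        return hashlib.sha256(data).hexdigest()
    if algorithm == "SHA3-512":
        return hashlib.sha3_512(data).hexdigest()
    raise ValueError(f"Unknown hash algorithm: {algorithm}")

print(compute_hash(b"example"))              # SHA-256 hex digest
print(compute_hash(b"example", "SHA3-512"))  # SHA3-512 hex digest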

View file

@@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):
         for u in urls:
             if u is None:
-                logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                 return self.download_alternative(url, tweet_id)
         logger.debug(f"found {urls=}")

View file

@@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 from .base_archiver import Archiver, ArchiveResult

 class TwitterArchiver(Archiver):
     """
     This Twitter Archiver uses unofficial scraping methods, and it works as
     an alternative to TwitterApiArchiver when no API credentials are provided.
     """

     name = "twitter"
     link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

View file

@@ -53,11 +53,25 @@ def missing_required_columns(gw: GWorksheet):
     return missing

+def should_process_sheet(c, sheet_name):
+    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
+        # ALLOW rules exist AND sheet name not explicitly allowed
+        return False
+    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
+        # BLOCK rules exist AND sheet name is blocked
+        return False
+    return True
+
 def process_sheet(c: Config):
     sh = c.gsheets_client.open(c.sheet)

     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
+        if not should_process_sheet(c, wks.title):
+            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
+            continue
         logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
         gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
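
To illustrate the allow/block semantics, a self-contained sketch: the function body is copied from the hunk above, while SimpleNamespace is a stand-in for the real Config object, used here only for illustration:

from types import SimpleNamespace

def should_process_sheet(c, sheet_name):
    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
        return False  # ALLOW rules exist and the sheet is not explicitly allowed
    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
        return False  # BLOCK rules exist and the sheet is blocked
    return True

allow_only = SimpleNamespace(worksheet_allow={"Sheet1"}, worksheet_block=set())
assert should_process_sheet(allow_only, "Sheet1")
assert not should_process_sheet(allow_only, "Other")

block_only = SimpleNamespace(worksheet_allow=set(), worksheet_block={"BlockedSheet"})
assert not should_process_sheet(block_only, "BlockedSheet")
assert should_process_sheet(block_only, "Sheet 2")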
@@ -80,7 +94,7 @@ def process_sheet(c: Config):
             if not is_retry: continue

             # All checks done - archival process starts here
             try:
                 gw.set_cell(row, 'status', 'Archive in progress')
                 url = expand_url(url)
                 c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -96,7 +110,7 @@ def process_sheet(c: Config):
                 YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                 TelegramArchiver(storage, c.webdriver),
                 TwitterArchiver(storage, c.webdriver),
                 VkArchiver(storage, c.webdriver, c.vk_config),
                 WaybackArchiver(storage, c.webdriver, c.wayback_config)
             ]
@@ -105,7 +119,7 @@ def process_sheet(c: Config):
                 try:
                     result = archiver.download(url, check_if_exists=c.check_if_exists)
                 except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
                 except Exception as e:
                     result = False
                     logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

View file

@@ -1,5 +1,6 @@
 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@@ -50,6 +51,14 @@ class Config:
         self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
         assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"

+        def ensure_set(l):
+            # always returns a set of strings; can receive a list or a single string
+            l = l if isinstance(l, list) else [l]
+            return set([x for x in l if isinstance(x, str) and len(x) > 0])
+
+        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
+        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
+
         self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
         self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
         self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
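
A quick demonstration of how ensure_set normalises the YAML values it can receive (the function is copied verbatim from the hunk above and runs on its own):

def ensure_set(l):
    l = l if isinstance(l, list) else [l]
    return set([x for x in l if isinstance(x, str) and len(x) > 0])

print(ensure_set("BlockedSheet"))         # {'BlockedSheet'}
print(ensure_set(["Sheet1", "Sheet 2"]))  # {'Sheet1', 'Sheet 2'}
print(ensure_set([]))                     # set()
print(ensure_set(None))                   # set() - non-strings are filtered out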
@@ -73,6 +82,8 @@ class Config:
         )
         self.webdriver = "not initialized"

+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+
         # ---------------------- SECRETS - APIs and service configurations
         secrets = self.config.get("secrets", {})
@@ -107,6 +118,7 @@ class Config:
             gd = secrets["google_drive"]
             self.gd_config = GDConfig(
                 root_folder_id=gd.get("root_folder_id"),
+                oauth_token_filename=gd.get("oauth_token_filename"),
                 service_account=gd.get("service_account", GDConfig.service_account)
             )
@@ -246,9 +258,12 @@ class Config:
         return json.dumps({
             "config_file": self.config_file,
             "sheet": self.sheet,
+            "worksheet_allow": list(self.worksheet_allow),
+            "worksheet_block": list(self.worksheet_block),
             "storage": self.storage,
             "header": self.header,
             "check_if_exists": self.check_if_exists,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
             "save_logs": self.save_logs,
             "selenium_config": asdict(self.selenium_config),
             "selenium_webdriver": self.webdriver != None,

View file

@@ -0,0 +1,73 @@
+import os.path
+
+from google.auth.transport.requests import Request
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+# If creating for the first time, download the OAuth Client IDs json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs
+# add "http://localhost:55192/" to the list of "Authorised redirect URIs"
+# https://davemateer.com/2022/04/28/google-drive-with-python for more information
+
+# You can run this code to get a new token and verify it belongs to the correct user
+# This token will be refreshed automatically by the auto-archiver
+
+# Code below from https://developers.google.com/drive/api/quickstart/python
+SCOPES = ['https://www.googleapis.com/auth/drive']
+
+def main():
+    token_file = 'gd-token.json'
+    creds = None
+    # The file token.json stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists(token_file):
+        creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            print('Requesting new token')
+            creds.refresh(Request())
+        else:
+            print('First run through so putting up login dialog')
+            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
+            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
+            creds = flow.run_local_server(port=55192)
+        # Save the credentials for the next run
+        with open(token_file, 'w') as token:
+            print('Saving new token')
+            token.write(creds.to_json())
+    else:
+        print('Token valid')
+
+    try:
+        service = build('drive', 'v3', credentials=creds)
+
+        # About the user
+        results = service.about().get(fields="*").execute()
+        emailAddress = results['user']['emailAddress']
+        print(emailAddress)
+
+        # Call the Drive v3 API and return some files
+        results = service.files().list(
+            pageSize=10, fields="nextPageToken, files(id, name)").execute()
+        items = results.get('files', [])
+
+        if not items:
+            print('No files found.')
+            return
+        print('Files:')
+        for item in items:
+            print(u'{0} ({1})'.format(item['name'], item['id']))
+
+    except HttpError as error:
+        print(f'An error occurred: {error}')
+
+if __name__ == '__main__':
+    main()
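
After running the script once, the saved token can be sanity-checked. A hedged sketch, assuming the google-auth authorized-user JSON layout (the key names below come from that format, not from this PR):

import json

with open('gd-token.json') as f:
    token = json.load(f)

# An authorized-user file normally carries the refresh token and granted scopes
print('scopes:', token.get('scopes'))
print('refresh_token present:', 'refresh_token' in token)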

View file

@@ -18,8 +18,19 @@ secrets:
   # needed if you use storage=gd
   google_drive:
-    # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
-    service_account: "service_account.json"
+    # To authenticate with google you have two options (1. service account OR 2. OAuth token)
+
+    # 1. service account - storage space will count towards the developer account
+    # filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
+    # service_account: "service_account.json"
+
+    # 2. OAuth token - storage space will count towards the owner of the GDrive folder
+    # (only 1. or 2. - if both are specified then 2. takes precedence)
+    # needs write access on the server so the refresh flow works
+    # To get the token, run the file `create_update_test_oauth_token.py`
+    # you can edit that file if you want a different token filename, default is "gd-token.json"
+    oauth_token_filename: "gd-token.json"
+
     root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

   # needed if you use storage=local
@@ -65,12 +76,25 @@ secrets:
 execution:
   # can be overwritten with CMD --sheet=
   sheet: your-sheet-name

+  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
+  # worksheet_allow and worksheet_block can be single values or lists
+  # if worksheet_allow is specified, worksheet_block is ignored
+  # worksheet_allow:
+  #   - Sheet1
+  #   - "Sheet 2"
+  # worksheet_block: BlockedSheet
+
   # which row of your tabs contains the header, can be overwritten with CMD --header=
   header: 1
   # which storage to use, can be overwritten with CMD --storage=
   storage: s3
   # defaults to false, when true will try to avoid duplicate URL archives
   check_if_exists: true

+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256
+
   # optional configurations for the selenium browser that takes screenshots, these are the defaults
   selenium:
     # values under 10s might mean screenshots fail to grab screenshot
@@ -95,3 +119,4 @@ execution:
       duration: duration
       screenshot: screenshot
       hash: hash

View file

@@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account
+from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request

 @dataclass
 class GDConfig:
     root_folder_id: str
-    folder: str = "default"
+    oauth_token_filename: str
     service_account: str = "service_account.json"
+    folder: str = "default"

 class GDStorage(Storage):
     def __init__(self, config: GDConfig):
         self.folder = config.folder
         self.root_folder_id = config.root_folder_id
-        creds = service_account.Credentials.from_service_account_file(
-            config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
+
+        SCOPES = ['https://www.googleapis.com/auth/drive']
+        token_file = config.oauth_token_filename
+        if token_file is not None:
+            """
+            Tokens are refreshed after 1 hour but keep working for 7 days (tbc),
+            so as long as the job doesn't last for 7 days
+            this method of refreshing only once per run will work;
+            see this link for details on the token:
+            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+            """
+            logger.debug(f'Using GD OAuth token {token_file}')
+            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+            if not creds or not creds.valid:
+                if creds and creds.expired and creds.refresh_token:
+                    logger.debug('Requesting new GD OAuth token')
+                    creds.refresh(Request())
+                else:
+                    raise Exception("Problem with creds - create the token again")
+                # Save the credentials for the next run
+                with open(token_file, 'w') as token:
+                    logger.debug('Saving new GD OAuth token')
+                    token.write(creds.to_json())
+            else:
+                logger.debug('GD OAuth Token valid')
+        else:
+            gd_service_account = config.service_account
+            logger.debug(f'Using GD Service Account {gd_service_account}')
+            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+
         self.service = build('drive', 'v3', credentials=creds)

     def get_cdn_url(self, key):
def get_cdn_url(self, key): def get_cdn_url(self, key):
@@ -28,6 +63,8 @@ class GDStorage(Storage):
         only support files saved in a folder for GD
         S3 supports folder and all stored in the root
         """
+        key = self.clean_key(key)
+
         full_name = os.path.join(self.folder, key)
         parent_id, folder_id = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)
@@ -52,6 +89,8 @@ class GDStorage(Storage):
         1. for each sub-folder in the path check if exists or create
         2. upload file to root_id/other_paths.../filename
         """
+        key = self.clean_key(key)
+
         full_name = os.path.join(self.folder, key)
         parent_id, upload_to = self.root_folder_id, None
         path_parts = full_name.split(os.path.sep)
@@ -77,13 +116,21 @@ class GDStorage(Storage):
         # GD only requires the filename not a file reader
         self.uploadf(filename, key, **kwargs)

+    def clean_key(self, key):
+        # GDrive does not work well with leading forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
+    # gets the Drive folderID if it is there
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
         """
         Retrieves the id of a folder or file from its @name and the @parent_id folder
         Optionally does multiple @retries and sleeps @sleep_seconds between them
         If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
         If @raise_on_missing will throw error when not found, or returns None
-        Will remember previous calls to avoid duplication if @use_cache
+        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
         Returns the id of the file or folder from its name as a string
         """
         # cache logic
@@ -96,7 +143,7 @@ class GDStorage(Storage):
         # API logic
         debug_header: str = f"[searching {name=} in {parent_id=}]"
-        query_string = f"'{parent_id}' in parents and name = '{name}' "
+        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
         if use_mime_type:
             query_string += f" and mimeType='application/vnd.google-apps.folder' "
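
For context, a hedged sketch of the kind of query the storage now issues; `service` is assumed to be a built Drive v3 client as in __init__ above, and the names are illustrative. Excluding trashed files matters because Drive keeps deleted items in the trash for around 30 days, and files().list would otherwise still return them:

parent_id, name = 'root', 'my-folder'
query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
results = service.files().list(q=query_string, fields='files(id, name)').execute()
for item in results.get('files', []):
    print(item['name'], item['id'])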

View file

@@ -1,4 +1,4 @@
-import uuid, os
+import uuid, os, mimetypes
 from dataclasses import dataclass

 import boto3
@@ -21,6 +21,7 @@ class S3Config:
     private: bool = False
     key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid

 class S3Storage(Storage):
     def __init__(self, config: S3Config):
@@ -70,4 +71,5 @@ class S3Storage(Storage):
             extra_args = kwargs.get("extra_args", {})
         else:
             extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
+        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
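
Setting ContentType from the key's extension lets browsers render HTML pages and play videos in place instead of downloading them under S3's default binary/octet-stream type. A quick look at what mimetypes.guess_type returns:

import mimetypes

for key in ('page.html', 'clip.mp4', 'shot.png', 'unknown.xyz'):
    print(key, '->', mimetypes.guess_type(key)[0])
# page.html -> text/html
# clip.mp4 -> video/mp4
# shot.png -> image/png
# unknown.xyz -> None  (a None ContentType may need a fallback before upload)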