Merge pull request #58 from bellingcat/dev

2022-09-21 18:53:13 +02:00 · 2022-09-21 18:53:13 +02:00 · 0bd9e043ed
commit 0bd9e043ed
--- a/.gitignore
+++ b/.gitignore
@ -16,4 +16,7 @@ config.yaml
 config-*.yaml
 logs/*
 local_archive/
-vk_config*.json
+vk_config*.json
+gd-token.json
+credentials.json
+secrets/*
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@ -26,8 +26,8 @@ class ArchiveResult:
    screenshot: str = None
    hash: str = None

-
 class Archiver(ABC):
+    HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
    name = "default"
    retry_regex = r"retrying at (\d+)$"

@ -47,7 +47,6 @@ class Archiver(ABC):
    def get_netloc(self, url):
        return urlparse(url).netloc

-    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
        """
        Generates an index.html page where each @urls_info is displayed
@ -163,10 +162,12 @@ class Archiver(ABC):
    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
-            # TODO: customizable hash
-            hash = hashlib.sha256(bytes)
-            # option to use SHA3_512 instead
-            # hash = hashlib.sha3_512(bytes)
+            logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
+
+            if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
+            elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
+            else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
+
        return hash.hexdigest()

    def get_screenshot(self, url):
--- a/archivers/twitter_api_archiver.py
+++ b/archivers/twitter_api_archiver.py
@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver):

            for u in urls:
                if u is None:
-                    logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
+                    logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
                    return self.download_alternative(url, tweet_id)
        logger.debug(f"found {urls=}")

--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@ -5,12 +5,12 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo

 from .base_archiver import Archiver, ArchiveResult

-
 class TwitterArchiver(Archiver):
    """
    This Twitter Archiver uses unofficial scraping methods, and it works as 
    an alternative to TwitterApiArchiver when no API credentials are provided.
    """
+
    name = "twitter"
    link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

--- a/auto_archive.py
+++ b/auto_archive.py
@ -53,11 +53,25 @@ def missing_required_columns(gw: GWorksheet):
    return missing


+def should_process_sheet(c, sheet_name):
+    if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
+        # ALLOW rules exist AND sheet name not explicitly allowed
+        return False
+    if len(c.worksheet_block) and sheet_name in c.worksheet_block:
+        # BLOCK rules exist AND sheet name is blocked
+        return False
+    return True
+
+
 def process_sheet(c: Config):
    sh = c.gsheets_client.open(c.sheet)

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
+        if not should_process_sheet(c, wks.title):
+            logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
+            continue
+
        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

@ -80,7 +94,7 @@ def process_sheet(c: Config):
                if not is_retry: continue

            # All checks done - archival process starts here
-            try: 
+            try:
                gw.set_cell(row, 'status', 'Archive in progress')
                url = expand_url(url)
                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@ -96,7 +110,7 @@ def process_sheet(c: Config):
                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                    TelegramArchiver(storage, c.webdriver),
                    TwitterArchiver(storage, c.webdriver),
-                    VkArchiver(storage,  c.webdriver, c.vk_config),
+                    VkArchiver(storage, c.webdriver, c.vk_config),
                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
                ]

@ -105,7 +119,7 @@ def process_sheet(c: Config):

                    try:
                        result = archiver.download(url, check_if_exists=c.check_if_exists)
-                    except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+                    except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
                    except Exception as e:
                        result = False
                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
--- a/configs/config.py
+++ b/configs/config.py
@ -1,5 +1,6 @@

 import argparse, yaml, json
+from archivers.base_archiver import Archiver
 import gspread
 from loguru import logger
 from selenium import webdriver
@ -50,6 +51,14 @@ class Config:

        self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
        assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
+
+        def ensure_set(l):
+            # always returns a set of non-empty strings; accepts a list of strings or a single string
+            l = l if isinstance(l, list) else [l]
+            return set([x for x in l if isinstance(x, str) and len(x) > 0])
+        self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
+        self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
+
        self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
        self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
        self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
@ -73,6 +82,8 @@ class Config:
        )
        self.webdriver = "not initialized"

+        Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
+
        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets", {})

@ -107,6 +118,7 @@ class Config:
            gd = secrets["google_drive"]
            self.gd_config = GDConfig(
                root_folder_id=gd.get("root_folder_id"),
+                oauth_token_filename=gd.get("oauth_token_filename"),
                service_account=gd.get("service_account", GDConfig.service_account)
            )

@ -246,9 +258,12 @@ class Config:
        return json.dumps({
            "config_file": self.config_file,
            "sheet": self.sheet,
+            "worksheet_allow": list(self.worksheet_allow),
+            "worksheet_block": list(self.worksheet_block),
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
+            "hash_algorithm": Archiver.HASH_ALGORITHM,
            "save_logs": self.save_logs,
            "selenium_config": asdict(self.selenium_config),
            "selenium_webdriver": self.webdriver != None,
--- a/create_update_test_oauth_token.py
+++ b/create_update_test_oauth_token.py
@ -0,0 +1,73 @@
+import os.path
+
+from google.auth.transport.requests import Request
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+# If creating the token for the first time, download the OAuth client ID JSON file `credentials.json` from https://console.cloud.google.com/apis/credentials (under "OAuth 2.0 Client IDs")
+# add "http://localhost:55192/" to the list of "Authorised redirect URIs"
+# https://davemateer.com/2022/04/28/google-drive-with-python for more information
+
+# You can run this code to get a new token and verify it belongs to the correct user
+# This token will be refreshed automatically by the auto-archiver
+
+# Code below from https://developers.google.com/drive/api/quickstart/python
+
+SCOPES = ['https://www.googleapis.com/auth/drive']
+
+
+def main():
+    token_file = 'gd-token.json'
+    creds = None
+
+    # The token file (gd-token.json) stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists(token_file):
+        creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            print('Requesting new token')
+            creds.refresh(Request())
+        else:
+            print('First run through so putting up login dialog')
+            # credentials.json downloaded from https://console.cloud.google.com/apis/credentials
+            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
+            creds = flow.run_local_server(port=55192)
+        # Save the credentials for the next run
+        with open(token_file, 'w') as token:
+            print('Saving new token')
+            token.write(creds.to_json())
+    else:
+        print('Token valid')
+
+    try:
+        service = build('drive', 'v3', credentials=creds)
+
+        # About the user
+        results = service.about().get(fields="*").execute()
+        emailAddress = results['user']['emailAddress']
+        print(emailAddress)
+
+        # Call the Drive v3 API and return some files
+        results = service.files().list(
+            pageSize=10, fields="nextPageToken, files(id, name)").execute()
+        items = results.get('files', [])
+
+        if not items:
+            print('No files found.')
+            return
+        print('Files:')
+        for item in items:
+            print(u'{0} ({1})'.format(item['name'], item['id']))
+
+    except HttpError as error:
+        print(f'An error occurred: {error}')
+
+
+if __name__ == '__main__':
+    main()
--- a/example.config.yaml
+++ b/example.config.yaml
@ -18,8 +18,19 @@ secrets:

  # needed if you use storage=gd
  google_drive:
-    # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
-    service_account: "service_account.json"
+    # To authenticate with google you have two options (1. service account OR 2. OAuth token)
+
+    # 1. service account - storage space will count towards the developer account
+    # filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
+    # service_account: "service_account.json"
+
+    # 2. OAuth token  - storage space will count towards the owner of the GDrive folder
+    # (use only one of 1. or 2. - if both are specified, option 2. takes precedence)
+    # needs write access on the server so refresh flow works
+    # To get the token, run the file `create_update_test_oauth_token.py`
+    # you can edit that file if you want a different token filename, default is "gd-token.json"
+    oauth_token_filename: "gd-token.json"
+
    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

  # needed if you use storage=local
@ -65,12 +76,25 @@ secrets:
 execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name
+
+  # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
+  # worksheet_allow and worksheet_block can be single values or lists
+  # if both are specified, a worksheet is only processed when it is in worksheet_allow and not in worksheet_block
+  # worksheet_allow:
+  #   - Sheet1
+  #   - "Sheet 2"
+  # worksheet_block: BlockedSheet
+
  # which row of your tabs contains the header, can be overwritten with CMD --header=
  header: 1
  # which storage to use, can be overwritten with CMD --storage=
  storage: s3
  # defaults to false, when true will try to avoid duplicate URL archives
  check_if_exists: true
+
+  # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
+  # hash_algorithm: SHA-256
+
  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # values under 10s might mean screenshots fail to grab screenshot
@ -95,3 +119,4 @@ execution:
    duration: duration
    screenshot: screenshot
    hash: hash
+
--- a/storages/gd_storage.py
+++ b/storages/gd_storage.py
@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account


+from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request
+
@dataclass
 class GDConfig:
    root_folder_id: str
-    folder: str = "default"
+    oauth_token_filename: str
    service_account: str = "service_account.json"
-
+    folder: str = "default"

 class GDStorage(Storage):
    def __init__(self, config: GDConfig):
        self.folder = config.folder
        self.root_folder_id = config.root_folder_id
-        creds = service_account.Credentials.from_service_account_file(
-            config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
+        
+        SCOPES=['https://www.googleapis.com/auth/drive']
+        
+        token_file = config.oauth_token_filename
+        if token_file is not None:
+            """
+            Tokens are refreshed after 1 hour 
+            however keep working for 7 days (tbc)
+            so as long as the job doesn't last for 7 days
+            then this method of refreshing only once per run will work
+            see this link for details on the token
+            https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+            """
+            logger.debug(f'Using GD OAuth token {token_file}')
+            creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+
+            if not creds or not creds.valid:
+                if creds and creds.expired and creds.refresh_token:
+                    logger.debug('Requesting new GD OAuth token')
+                    creds.refresh(Request())
+                else:
+                    raise Exception("Problem with creds - create the token again")
+
+                # Save the credentials for the next run
+                with open(token_file, 'w') as token:
+                    logger.debug('Saving new GD OAuth token')
+                    token.write(creds.to_json())
+            else:
+                logger.debug('GD OAuth Token valid')
+        else:
+            gd_service_account = config.service_account
+            logger.debug(f'Using GD Service Account {gd_service_account}')
+            creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+
        self.service = build('drive', 'v3', credentials=creds)

    def get_cdn_url(self, key):
@ -28,6 +63,8 @@ class GDStorage(Storage):
        only support files saved in a folder for GD
        S3 supports folder and all stored in the root
        """
+        key = self.clean_key(key)
+
        full_name = os.path.join(self.folder, key)
        parent_id, folder_id = self.root_folder_id, None
        path_parts = full_name.split(os.path.sep)
@ -52,6 +89,8 @@ class GDStorage(Storage):
        1. for each sub-folder in the path check if exists or create
        2. upload file to root_id/other_paths.../filename
        """
+        key = self.clean_key(key)
+
        full_name = os.path.join(self.folder, key)
        parent_id, upload_to = self.root_folder_id, None
        path_parts = full_name.split(os.path.sep)
@ -77,13 +116,21 @@ class GDStorage(Storage):
        # GD only requires the filename not a file reader
        self.uploadf(filename, key, **kwargs)

-    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
+    def clean_key(self, key):
+        # GDrive does not work well with leading forward slashes and some keys come with that
+        if key.startswith('/'):
+            logger.debug(f'Found and fixed a leading "/" for {key=}')
+            return key[1:]
+        return key
+
+    # gets the Drive folderID if it is there
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
        """
        Retrieves the id of a folder or file from its @name and the @parent_id folder
        Optionally does multiple @retries and sleeps @sleep_seconds between them
        If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
        If @raise_on_missing will throw error when not found, or returns None
-        Will remember previous calls to avoid duplication if @use_cache
+        Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
        Returns the id of the file or folder from its name as a string
        """
        # cache logic
@ -96,7 +143,7 @@ class GDStorage(Storage):

        # API logic
        debug_header: str = f"[searching {name=} in {parent_id=}]"
-        query_string = f"'{parent_id}' in parents and name = '{name}' "
+        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
        if use_mime_type:
            query_string += f" and mimeType='application/vnd.google-apps.folder' "

--- a/storages/s3_storage.py
+++ b/storages/s3_storage.py
@ -1,4 +1,4 @@
-import uuid, os
+import uuid, os, mimetypes
 from dataclasses import dataclass

 import boto3
@ -21,6 +21,7 @@ class S3Config:
    private: bool = False
    key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid

+
 class S3Storage(Storage):

    def __init__(self, config: S3Config):
@ -70,4 +71,5 @@ class S3Storage(Storage):
            extra_args = kwargs.get("extra_args", {})
        else:
            extra_args = kwargs.get("extra_args", {'ACL': 'public-read'})
+        extra_args['ContentType'] = mimetypes.guess_type(key)[0]
        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)