Add header argument; set up webdriver

2022-02-25 16:09:35 +01:00 · 2022-02-25 16:09:35 +01:00 · 63a2847ac9
commit 63a2847ac9
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@ -43,6 +43,7 @@ class Archiver(ABC):
    def get_netloc(self, url):
        return urlparse(url).netloc

+
    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]" includes extension
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@ -55,8 +55,8 @@ class TelegramArchiver(Archiver):
        # extract duration from HTML
        duration = s.find_all('time')[0].contents[0]
        if ':' in duration:
-            duration = float(duration.split(':')[0]) * 60
-            + float(duration.split(':')[1])
+            duration = float(duration.split(
+                ':')[0]) * 60 + float(duration.split(':')[1])
        else:
            duration = float(duration)

--- a/auto_archive.py
+++ b/auto_archive.py
@ -6,6 +6,7 @@ import shutil
 import gspread
 from loguru import logger
 from dotenv import load_dotenv
+from selenium import webdriver

 import archivers
 from storages import S3Storage, S3Config
@ -27,13 +28,17 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):

    batch_if_valid('archive', result.cdn_url)
    batch_if_valid('date', True, datetime.datetime.now().isoformat())
-    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+    batch_if_valid('thumbnail', result.thumbnail,
+                   f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))
+    batch_if_valid('screenshot', result.screenshot)
+    batch_if_valid('hash', result.hash)

    if result.timestamp and type(result.timestamp) != str:
-        result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
+        result.timestamp = datetime.datetime.fromtimestamp(
+            result.timestamp).isoformat()
    batch_if_valid('timestamp', result.timestamp)

    gw.batch_set_cell(cell_updates)
@ -50,7 +55,7 @@ def expand_url(url):
    return url


-def process_sheet(sheet):
+def process_sheet(sheet, header=1):
    gc = gspread.service_account(filename='service_account.json')
    sh = gc.open(sheet)

@ -61,73 +66,97 @@ def process_sheet(sheet):
        secret=os.getenv('DO_SPACES_SECRET')
    )

+    driver = webdriver.Firefox()
+    driver.set_window_size(1400, 2000)
+
    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
-        gw = GWorksheet(wks)
+        gw = GWorksheet(wks, header_row=header)

        if not gw.col_exists('url'):
-            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
+            logger.warning(  # column name kept in sync with GWorksheet.COLUMN_NAMES['url'] ('link')
+                f'No "Link" column found, skipping worksheet {wks.title}')
            continue

        if not gw.col_exists('status'):
-            logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
+            logger.warning(
+                f'No "Archive status" column found, skipping worksheet {wks.title}')
            continue

        # archives will be in a folder 'doc_name/worksheet_name'
-        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
        s3_client = S3Storage(s3_config)

        # order matters, first to succeed excludes remaining
        active_archivers = [
-            archivers.TelegramArchiver(s3_client),
-            archivers.TiktokArchiver(s3_client),
-            archivers.YoutubeDLArchiver(s3_client),
-            archivers.WaybackArchiver(s3_client)
+            archivers.TelegramArchiver(s3_client, driver),
+            archivers.TiktokArchiver(s3_client, driver),
+            archivers.YoutubeDLArchiver(s3_client, driver),
+            archivers.TwitterArchiver(s3_client, driver),
+            archivers.WaybackArchiver(s3_client, driver)
        ]

+        values = gw.get_values()
        # loop through rows in worksheet
-        for row in range(2, gw.count_rows() + 1):
-            url = gw.get_cell(row, 'url')
-            status = gw.get_cell(row, 'status')
+        for row in range(1 + header, gw.count_rows() + 1):
+            row_values = values[row-1]
+            url = gw.get_cell(row_values, 'url')
+            status = gw.get_cell(row_values, 'status')
            if url != '' and status in ['', None]:
-                gw.set_cell(row, 'status', 'Archive in progress')
+                url = gw.get_cell(row, 'url')  # second look-up by row index (first used the batched values)
+                status = gw.get_cell(row, 'status')  # pass the row index, not the previously read status value

-                url = expand_url(url)
+                if url != '' and status in ['', None]:
+                    gw.set_cell(row, 'status', 'Archive in progress')

-                for archiver in active_archivers:
-                    logger.debug(f'Trying {archiver} on row {row}')
+                    url = expand_url(url)

-                    # TODO: add support for multiple videos/images
-                    try:
-                        result = archiver.download(url, check_if_exists=True)
-                    except Exception as e:
-                        result = False
-                        logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
+                    for archiver in active_archivers:
+                        logger.debug(f'Trying {archiver} on row {row}')
+
+                        # TODO: add support for multiple videos/images
+                        # try:
+                        result = archiver.download(
+                            url, check_if_exists=True)
+                        # except Exception as e:
+                        #     result = False
+                        #     logger.error(
+                        #         f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
+
+                        if result:
+                            if result.status in ['success', 'already archived']:
+                                result.status = archiver.name + \
+                                    ": " + str(result.status)
+                                logger.success(
+                                    f'{archiver} succeeded on row {row}')
+                                break
+                            logger.warning(
+                                f'{archiver} did not succeed on row {row}, final status: {result.status}')
+                            result.status = archiver.name + \
+                                ": " + str(result.status)

                    if result:
-                        if result.status in ['success', 'already archived']:
-                            logger.success(f'{archiver} succeeded on row {row}')
-                            break
-                        logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
+                        update_sheet(gw, row, result)
+                    else:
+                        gw.set_cell(row, 'status', 'failed: no archiver')

-                if result:
-                    update_sheet(gw, row, result)
-                else:
-                    gw.set_cell(row, 'status', 'failed: no archiver')
+    driver.quit()


 def main():
    parser = argparse.ArgumentParser(
        description='Automatically archive social media videos from a Google Sheets document')
    parser.add_argument('--sheet', action='store', dest='sheet')
+    parser.add_argument('--header', action='store', dest='header', default=1, type=int)
    args = parser.parse_args()

    logger.info(f'Opening document {args.sheet}')

    mkdir_if_not_exists('tmp')
-    process_sheet(args.sheet)
+    process_sheet(args.sheet, header=args.header)
    shutil.rmtree('tmp')

+
 if __name__ == '__main__':
    main()
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@ -3,7 +3,7 @@ from gspread import utils

 class GWorksheet:
    COLUMN_NAMES = {
-        'url': 'media url',
+        'url': 'link',
        'archive': 'archive location',
        'date': 'archive date',
        'status': 'archive status',
@ -11,12 +11,14 @@ class GWorksheet:
        'thumbnail_index': 'thumbnail index',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
-        'duration': 'duration'
+        'duration': 'duration',
+        'screenshot': 'screenshot',
+        'hash': 'hash'
    }

-    def __init__(self, worksheet, columns=COLUMN_NAMES):
+    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
        self.wks = worksheet
-        self.headers = [v.lower() for v in self.wks.row_values(1)]
+        self.headers = [v.lower() for v in self.wks.row_values(header_row)]
        self.columns = columns

    def _check_col_exists(self, col: str):
@ -38,6 +40,9 @@ class GWorksheet:
        # row is 1-based
        return self.wks.row_values(row)

+    def get_values(self):
+        return self.wks.get_values()
+
    def get_cell(self, row, col: str):
        """
        returns the cell value from (row, col),