From ca44a40b88f75fb2b078cf2e92057bfd937788dd Mon Sep 17 00:00:00 2001
From: erinhmclark
Date: Mon, 10 Mar 2025 19:03:45 +0000
Subject: [PATCH] Ruff fix on src.

---
 .pre-commit-config.yaml                            |  7 +++++++
 src/auto_archiver/core/base_module.py              |  4 ++--
 src/auto_archiver/core/config.py                   |  3 +--
 src/auto_archiver/core/extractor.py                |  3 ---
 src/auto_archiver/core/metadata.py                 |  2 +-
 src/auto_archiver/core/orchestrator.py             |  2 +-
 .../modules/gdrive_storage/gdrive_storage.py       |  2 +-
 .../generic_extractor/generic_extractor.py         |  5 +++--
 .../modules/generic_extractor/twitter.py           |  6 ++++--
 .../modules/gsheet_feeder_db/gsheet_feeder_db.py   |  2 +-
 .../modules/html_formatter/html_formatter.py       |  4 +++-
 .../instagram_api_extractor.py                     | 16 ++++++++--------
 .../instagram_extractor/instagram_extractor.py     |  8 +++++---
 .../instagram_tbot_extractor.py                    |  4 ++--
 .../screenshot_enricher/screenshot_enricher.py     |  3 ++-
 .../modules/ssl_enricher/ssl_enricher.py           |  3 ++-
 .../telegram_extractor/telegram_extractor.py       |  4 +++-
 .../telethon_extractor/telethon_extractor.py       |  8 +++++---
 .../thumbnail_enricher/thumbnail_enricher.py       |  3 ++-
 .../wacz_extractor_enricher.py                     |  5 +++--
 .../wayback_extractor_enricher.py                  |  7 ++++---
 .../modules/whisper_enricher/whisper_enricher.py   |  5 +++--
 src/auto_archiver/utils/webdriver.py               |  6 +++---
 23 files changed, 66 insertions(+), 46 deletions(-)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..0fdf695
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.10
+    hooks:
+      - id: ruff
+#        args: [ --fix ]
+      - id: ruff-format
diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index d809e59..d717e4b 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
-from typing import Mapping, Any, Type, TYPE_CHECKING
+from typing import Mapping, Any, TYPE_CHECKING
 from abc import ABC
-from copy import deepcopy, copy
+from copy import deepcopy
 from tempfile import TemporaryDirectory
 from auto_archiver.utils import url as UrlUtil
 from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index 8122809..f9e8c17 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -6,7 +6,7 @@ flexible setup in various environments.
 """
 
 import argparse
-from ruamel.yaml import YAML, CommentedMap, add_representer
+from ruamel.yaml import YAML, CommentedMap
 import json
 
 from loguru import logger
@@ -14,7 +14,6 @@ from loguru import logger
 from copy import deepcopy
 
 from auto_archiver.core.consts import MODULE_TYPES
-from typing import Any, List, Type, Tuple
 
 
 _yaml: YAML = YAML()
diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py
index 8ad13f5..cf42f1e 100644
--- a/src/auto_archiver/core/extractor.py
+++ b/src/auto_archiver/core/extractor.py
@@ -7,12 +7,9 @@ Factory method to initialize an extractor instance based on its name.
""" from __future__ import annotations -from pathlib import Path from abc import abstractmethod -from dataclasses import dataclass import mimetypes import os -import mimetypes import requests from loguru import logger from retrying import retry diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 9c696a2..7961981 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -13,7 +13,7 @@ from __future__ import annotations import hashlib from typing import Any, List, Union, Dict from dataclasses import dataclass, field -from dataclasses_json import dataclass_json, config +from dataclasses_json import dataclass_json import datetime from urllib.parse import urlparse from dateutil.parser import parse as parse_dt diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ba00995..6200b0a 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -123,7 +123,7 @@ Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_typ ) if module_type == "extractor" and config["steps"].get("archivers"): raise SetupError( - f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \ + "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n" ) raise SetupError( diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index f01ea4e..02ec427 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -135,7 +135,7 @@ class GDriveStorage(Storage): debug_header: str = f"[searching {name=} in {parent_id=}]" query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false " if use_mime_type: - query_string += f" and mimeType='application/vnd.google-apps.folder' " + query_string += " and mimeType='application/vnd.google-apps.folder' " for attempt in range(retries): results = ( diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 107ce93..08118ad 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -1,4 +1,5 @@ -import datetime, os +import datetime +import os import importlib import subprocess from typing import Generator, Type @@ -386,7 +387,7 @@ class GenericExtractor(Extractor): item.set("replaced_url", url) ydl_options = { - "outtmpl": os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"), + "outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"), "quiet": False, "noplaylist": not self.allow_playlist, "writesubtitles": self.subtitles, diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 5b8468c..e4cbe74 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,4 +1,6 @@ -import re, mimetypes, json +import re +import mimetypes +import json from datetime import datetime from loguru import logger @@ -35,7 +37,7 @@ class Twitter(GenericDropin): result 
= Metadata() try: if not tweet.get("user") or not tweet.get("created_at"): - raise ValueError(f"Error retreiving post. Are you sure it exists?") + raise ValueError("Error retreiving post. Are you sure it exists?") timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py index 4a9c9b3..109be3f 100644 --- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -20,7 +20,7 @@ from slugify import slugify from auto_archiver.core import Feeder, Database, Media from auto_archiver.core import Metadata from auto_archiver.modules.gsheet_feeder_db import GWorksheet -from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp +from auto_archiver.utils.misc import get_current_timestamp class GsheetsFeederDB(Feeder, Database): diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 88a9eca..f5da1d8 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -1,5 +1,7 @@ from __future__ import annotations -import mimetypes, os, pathlib +import mimetypes +import os +import pathlib from jinja2 import Environment, FileSystemLoader from urllib.parse import quote from loguru import logger diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index bb37df2..bae06bc 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -95,7 +95,7 @@ class InstagramAPIExtractor(Extractor): result.set_title(user.get("full_name", username)).set("data", user) if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")): filename = self.download_from_url(pic_url) - result.add_media(Media(filename=filename), id=f"profile_picture") + result.add_media(Media(filename=filename), id="profile_picture") if self.full_profile: user_id = user.get("pk") @@ -133,7 +133,7 @@ class InstagramAPIExtractor(Extractor): def download_all_highlights(self, result, username, user_id): count_highlights = 0 - highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id}) + highlights = self.call_api("v1/user/highlights", {"user_id": user_id}) for h in highlights: try: h_info = self._download_highlights_reusable(result, h.get("pk")) @@ -151,9 +151,9 @@ class InstagramAPIExtractor(Extractor): def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata: if id: - post = self.call_api(f"v1/media/by/id", {"id": id}) + post = self.call_api("v1/media/by/id", {"id": id}) else: - post = self.call_api(f"v1/media/by/code", {"code": code}) + post = self.call_api("v1/media/by/code", {"code": code}) assert post, f"Post {id or code} not found" if caption_text := post.get("caption_text"): @@ -173,7 +173,7 @@ class InstagramAPIExtractor(Extractor): return result.success("insta highlights") def _download_highlights_reusable(self, result: Metadata, id: str) -> dict: - full_h = self.call_api(f"v2/highlight/by/id", {"id": id}) + full_h = 
self.call_api("v2/highlight/by/id", {"id": id}) h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}") assert h_info, f"Highlight {id} not found: {full_h=}" @@ -200,7 +200,7 @@ class InstagramAPIExtractor(Extractor): return result.success(f"insta stories {now}") def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]: - stories = self.call_api(f"v1/user/stories/by/username", {"username": username}) + stories = self.call_api("v1/user/stories/by/username", {"username": username}) if not stories or not len(stories): return [] stories = stories[::-1] # newest to oldest @@ -219,7 +219,7 @@ class InstagramAPIExtractor(Extractor): post_count = 0 while end_cursor != "": - posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}) + posts = self.call_api("v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}) if not len(posts) or not type(posts) == list or len(posts) != 2: break posts, end_cursor = posts[0], posts[1] @@ -244,7 +244,7 @@ class InstagramAPIExtractor(Extractor): tagged_count = 0 while next_page_id != None: - resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}) + resp = self.call_api("v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}) posts = resp.get("response", {}).get("items", []) if not len(posts): break diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index f310771..294b4e7 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -4,7 +4,9 @@ highlights, and tagged posts. Authentication is required via username/password o """ -import re, os, shutil +import re +import os +import shutil import instaloader from loguru import logger @@ -36,9 +38,9 @@ class InstagramExtractor(Extractor): ) try: self.insta.load_session_from_file(self.username, self.session_file) - except Exception as e: + except Exception: try: - logger.debug(f"Session file failed", exc_info=True) + logger.debug("Session file failed", exc_info=True) logger.info("No valid session file found - Attempting login with use and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 39ed893..81d2bf6 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -51,7 +51,7 @@ class InstagramTbotExtractor(Extractor): """Initializes the Telegram client.""" try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) - except OperationalError as e: + except OperationalError: logger.error( f"Unable to access the {self.session_file} session. " "Ensure that you don't use the same session file here and in telethon_extractor. 
" @@ -68,7 +68,7 @@ class InstagramTbotExtractor(Extractor): def download(self, item: Metadata) -> Metadata: url = item.get_url() - if not "instagram.com" in url: + if "instagram.com" not in url: return False result = Metadata() diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 9fa2d62..491bd51 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -1,5 +1,6 @@ from loguru import logger -import time, os +import time +import os import base64 from selenium.common.exceptions import TimeoutException diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 74d80ce..3ab1389 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -1,4 +1,5 @@ -import ssl, os +import ssl +import os from slugify import slugify from urllib.parse import urlparse from loguru import logger diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index 5184024..e63fb8d 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -1,4 +1,6 @@ -import requests, re, html +import requests +import re +import html from bs4 import BeautifulSoup from loguru import logger diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index be878a2..b06962e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -10,7 +10,9 @@ from telethon.errors.rpcerrorlist import ( ) from loguru import logger from tqdm import tqdm -import re, time, os +import re +import time +import os from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media @@ -63,11 +65,11 @@ class TelethonExtractor(Extractor): logger.warning( f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting." ) - except ValueError as e: + except ValueError: logger.info(f"joining new channel {invite=}") try: self.client(ImportChatInviteRequest(match.group(2))) - except UserAlreadyParticipantError as e: + except UserAlreadyParticipantError: logger.info(f"already joined {invite=}") except InviteRequestSentError: logger.warning(f"already sent a join request with {invite} still no answer") diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 2f50c6b..1543cec 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -7,7 +7,8 @@ and identify important moments without watching the entire video. 
""" -import ffmpeg, os +import ffmpeg +import os from loguru import logger from auto_archiver.core import Enricher diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index ec61572..975d49a 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -1,6 +1,8 @@ import jsonlines import mimetypes -import os, shutil, subprocess +import os +import shutil +import subprocess from zipfile import ZipFile from loguru import logger from warcio.archiveiterator import ArchiveIterator @@ -186,7 +188,6 @@ class WaczExtractorEnricher(Enricher, Extractor): # get media out of .warc counter = 0 seen_urls = set() - import json with open(warc_filename, "rb") as warc_stream: for record in ArchiveIterator(warc_stream): diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py index 2dc3545..f06effd 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/wayback_extractor_enricher.py @@ -1,6 +1,7 @@ import json from loguru import logger -import time, requests +import time +import requests from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import url as UrlUtil @@ -57,7 +58,7 @@ class WaybackExtractorEnricher(Enricher, Extractor): if not job_id: logger.error(f"Wayback failed with {r.json()}") return False - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}") return False @@ -80,7 +81,7 @@ class WaybackExtractorEnricher(Enricher, Extractor): except requests.exceptions.RequestException as e: logger.warning(f"RequestException: fetching status for {url=} due to: {e}") break - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}") break except Exception as e: diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 0c884bb..d2205e2 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -1,5 +1,6 @@ import traceback -import requests, time +import requests +import time from loguru import logger from auto_archiver.core import Enricher @@ -16,7 +17,7 @@ class WhisperEnricher(Enricher): def setup(self) -> None: self.stores = self.config["steps"]["storages"] self.s3 = self.module_factory.get_module("s3_storage", self.config) - if not "s3_storage" in self.stores: + if "s3_storage" not in self.stores: logger.error( "WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called." 
             )
diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py
index 0690ab5..2ba185e 100644
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -66,15 +66,15 @@ class CookieSettingDriver(webdriver.Firefox):
 
         if self.facebook_accept_cookies:
             try:
-                logger.debug(f"Trying fb click accept cookie popup.")
+                logger.debug("Trying fb click accept cookie popup.")
                 super(CookieSettingDriver, self).get("http://www.facebook.com")
                 essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
                 essential_only.click()
-                logger.debug(f"fb click worked")
+                logger.debug("fb click worked")
                 # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                 time.sleep(2)
             except Exception as e:
-                logger.warning(f"Failed on fb accept cookies.", e)
+                logger.warning("Failed on fb accept cookies.", e)
 
         # now get the actual URL
         super(CookieSettingDriver, self).get(url)
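
Note: with the .pre-commit-config.yaml added above, the new hooks are typically
enabled locally via the standard pre-commit workflow (assuming pre-commit itself
is already installed, e.g. via pip install pre-commit):

    pre-commit install          # register the hooks to run on each git commit
    pre-commit run --all-files  # one-off ruff lint + ruff-format pass over the repo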