Mirror of https://github.com/bellingcat/auto-archiver

Ruff fix on src.

parent 85abe1837a
commit ca44a40b88

@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.10
+    hooks:
+      - id: ruff
+        # args: [ --fix ]
+      - id: ruff-format

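The hunk above adds ruff to pre-commit as a lint hook and a format hook. For orientation, a minimal sketch of running the equivalent checks directly (assuming the `ruff` executable is installed, and using the `src` directory named in the commit message):

```python
# Minimal sketch: the two steps the hooks above configure, run ad hoc.
import subprocess

# Lint pass; passing "--fix" would apply autofixes, mirroring the
# commented-out `args: [ --fix ]` line in the hook definition.
subprocess.run(["ruff", "check", "src"], check=True)

# Formatting pass, mirroring the `ruff-format` hook.
subprocess.run(["ruff", "format", "src"], check=True)
```
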
@@ -1,8 +1,8 @@
from __future__ import annotations

-from typing import Mapping, Any, Type, TYPE_CHECKING
+from typing import Mapping, Any, TYPE_CHECKING
from abc import ABC
-from copy import deepcopy, copy
+from copy import deepcopy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES

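The two changes above drop imported names that are never referenced (ruff's F401, unused import). A minimal sketch of the pattern using names from this hunk:

```python
# After the cleanup every imported name is used; `copy` and `Type` were
# imported but unreferenced, so the autofix removes them.
from copy import deepcopy

def clone_config(config: dict) -> dict:
    # deepcopy is actually called, so its import stays
    return deepcopy(config)

print(clone_config({"steps": {"extractors": []}}))
```
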
@@ -6,7 +6,7 @@ flexible setup in various environments.
"""

import argparse
-from ruamel.yaml import YAML, CommentedMap, add_representer
+from ruamel.yaml import YAML, CommentedMap
import json

from loguru import logger

@@ -14,7 +14,6 @@ from loguru import logger
from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES

-from typing import Any, List, Type, Tuple

_yaml: YAML = YAML()

@@ -7,12 +7,9 @@ Factory method to initialize an extractor instance based on its name.
"""

from __future__ import annotations
from pathlib import Path
from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
-import mimetypes
import requests
from loguru import logger
from retrying import retry

@@ -13,7 +13,7 @@ from __future__ import annotations
import hashlib
from typing import Any, List, Union, Dict
from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json, config
+from dataclasses_json import dataclass_json
import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt

@@ -123,7 +123,7 @@ Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_typ
            )
        if module_type == "extractor" and config["steps"].get("archivers"):
            raise SetupError(
-                f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
+                "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
            )
        raise SetupError(

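The one-character change above recurs in many hunks below: an `f` prefix is removed from a string literal that contains no `{}` placeholders (ruff's F541), so the resulting message is unchanged. A minimal illustration:

```python
# F541: an f-prefix with no placeholders does nothing, so the fix drops it.
step = "extractors"
old_msg = f"the 'archivers' step name has been changed"  # flagged: no {} fields
new_msg = "the 'archivers' step name has been changed"   # same string, plain literal
keep_f = f"the new step name is {step}"                  # real placeholder keeps the f-prefix
assert old_msg == new_msg
print(keep_f)
```
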
@@ -135,7 +135,7 @@ class GDriveStorage(Storage):
        debug_header: str = f"[searching {name=} in {parent_id=}]"
        query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
        if use_mime_type:
-            query_string += f" and mimeType='application/vnd.google-apps.folder' "
+            query_string += " and mimeType='application/vnd.google-apps.folder' "

        for attempt in range(retries):
            results = (

@@ -1,4 +1,5 @@
-import datetime, os
+import datetime
+import os
import importlib
import subprocess
from typing import Generator, Type

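This hunk, and several like it below, splits a combined import statement into one import per line (pycodestyle E401, "multiple imports on one line"); behaviour is identical, but single-name imports are easier to diff and prune. A small sketch:

```python
# Before the fix: import datetime, os  (one statement, two modules)
# After the fix: one module per line, bound to the same names.
import datetime
import os

print(datetime.datetime.now().isoformat(), os.getcwd())
```
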
@@ -386,7 +387,7 @@ class GenericExtractor(Extractor):
            item.set("replaced_url", url)

        ydl_options = {
-            "outtmpl": os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"),
+            "outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
            "quiet": False,
            "noplaylist": not self.allow_playlist,
            "writesubtitles": self.subtitles,

@@ -1,4 +1,6 @@
-import re, mimetypes, json
+import re
+import mimetypes
+import json
from datetime import datetime

from loguru import logger

@@ -35,7 +37,7 @@ class Twitter(GenericDropin):
        result = Metadata()
        try:
            if not tweet.get("user") or not tweet.get("created_at"):
-                raise ValueError(f"Error retreiving post. Are you sure it exists?")
+                raise ValueError("Error retreiving post. Are you sure it exists?")
            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        except (ValueError, KeyError) as ex:
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")

@@ -20,7 +20,7 @@ from slugify import slugify
from auto_archiver.core import Feeder, Database, Media
from auto_archiver.core import Metadata
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
-from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
+from auto_archiver.utils.misc import get_current_timestamp


class GsheetsFeederDB(Feeder, Database):

@@ -1,5 +1,7 @@
from __future__ import annotations
-import mimetypes, os, pathlib
+import mimetypes
+import os
+import pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
from loguru import logger

@@ -95,7 +95,7 @@ class InstagramAPIExtractor(Extractor):
        result.set_title(user.get("full_name", username)).set("data", user)
        if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
            filename = self.download_from_url(pic_url)
-            result.add_media(Media(filename=filename), id=f"profile_picture")
+            result.add_media(Media(filename=filename), id="profile_picture")

        if self.full_profile:
            user_id = user.get("pk")

@@ -133,7 +133,7 @@ class InstagramAPIExtractor(Extractor):

    def download_all_highlights(self, result, username, user_id):
        count_highlights = 0
-        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
+        highlights = self.call_api("v1/user/highlights", {"user_id": user_id})
        for h in highlights:
            try:
                h_info = self._download_highlights_reusable(result, h.get("pk"))

@@ -151,9 +151,9 @@ class InstagramAPIExtractor(Extractor):

    def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
        if id:
-            post = self.call_api(f"v1/media/by/id", {"id": id})
+            post = self.call_api("v1/media/by/id", {"id": id})
        else:
-            post = self.call_api(f"v1/media/by/code", {"code": code})
+            post = self.call_api("v1/media/by/code", {"code": code})
        assert post, f"Post {id or code} not found"

        if caption_text := post.get("caption_text"):

@@ -173,7 +173,7 @@ class InstagramAPIExtractor(Extractor):
        return result.success("insta highlights")

    def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
-        full_h = self.call_api(f"v2/highlight/by/id", {"id": id})
+        full_h = self.call_api("v2/highlight/by/id", {"id": id})
        h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
        assert h_info, f"Highlight {id} not found: {full_h=}"

@@ -200,7 +200,7 @@ class InstagramAPIExtractor(Extractor):
        return result.success(f"insta stories {now}")

    def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
-        stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
+        stories = self.call_api("v1/user/stories/by/username", {"username": username})
        if not stories or not len(stories):
            return []
        stories = stories[::-1]  # newest to oldest

@@ -219,7 +219,7 @@ class InstagramAPIExtractor(Extractor):

        post_count = 0
        while end_cursor != "":
-            posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
+            posts = self.call_api("v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
            if not len(posts) or not type(posts) == list or len(posts) != 2:
                break
            posts, end_cursor = posts[0], posts[1]

@@ -244,7 +244,7 @@ class InstagramAPIExtractor(Extractor):

        tagged_count = 0
        while next_page_id != None:
-            resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
+            resp = self.call_api("v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
            posts = resp.get("response", {}).get("items", [])
            if not len(posts):
                break

@@ -4,7 +4,9 @@ highlights, and tagged posts. Authentication is required via username/password o

"""

-import re, os, shutil
+import re
+import os
+import shutil
import instaloader
from loguru import logger

@@ -36,9 +38,9 @@ class InstagramExtractor(Extractor):
        )
        try:
            self.insta.load_session_from_file(self.username, self.session_file)
-        except Exception as e:
+        except Exception:
            try:
-                logger.debug(f"Session file failed", exc_info=True)
+                logger.debug("Session file failed", exc_info=True)
                logger.info("No valid session file found - Attempting login with use and password.")
                self.insta.login(self.username, self.password)
                self.insta.save_session_to_file(self.session_file)

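Here and in several hunks below, the unused `as e` binding is dropped from handlers that never reference the exception object (pyflakes/ruff typically report this as F841, an unused local variable). A minimal sketch of the pattern, with hypothetical `load` and `login` callables standing in for the Instagram session calls:

```python
# If the handler never uses the exception object, a bare `except Exception:` is enough.
def load_or_login(load, login):
    try:
        return load()
    except Exception:  # was `except Exception as e:` with `e` never used
        # fall back to a fresh login; nothing here needs the exception object
        return login()

print(load_or_login(lambda: 1 / 0, lambda: "logged in"))
```
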
@@ -51,7 +51,7 @@ class InstagramTbotExtractor(Extractor):
        """Initializes the Telegram client."""
        try:
            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
-        except OperationalError as e:
+        except OperationalError:
            logger.error(
                f"Unable to access the {self.session_file} session. "
                "Ensure that you don't use the same session file here and in telethon_extractor. "

@@ -68,7 +68,7 @@ class InstagramTbotExtractor(Extractor):

    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
-        if not "instagram.com" in url:
+        if "instagram.com" not in url:
            return False

        result = Metadata()

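The membership test above is rewritten from `not "instagram.com" in url` to `"instagram.com" not in url` (pycodestyle E713). The two forms are equivalent; `not in` simply reads as a single operator. For example:

```python
url = "https://example.com/post/123"
# old spelling:  if not "instagram.com" in url:
# new spelling, identical behaviour:
if "instagram.com" not in url:
    print("not an Instagram URL, skipping")
```
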
@@ -1,5 +1,6 @@
from loguru import logger
-import time, os
+import time
+import os
import base64

from selenium.common.exceptions import TimeoutException

@@ -1,4 +1,5 @@
-import ssl, os
+import ssl
+import os
from slugify import slugify
from urllib.parse import urlparse
from loguru import logger

@@ -1,4 +1,6 @@
-import requests, re, html
+import requests
+import re
+import html
from bs4 import BeautifulSoup
from loguru import logger

@@ -10,7 +10,9 @@ from telethon.errors.rpcerrorlist import (
)
from loguru import logger
from tqdm import tqdm
-import re, time, os
+import re
+import time
+import os

from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media

@@ -63,11 +65,11 @@ class TelethonExtractor(Extractor):
                logger.warning(
                    f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting."
                )
-            except ValueError as e:
+            except ValueError:
                logger.info(f"joining new channel {invite=}")
                try:
                    self.client(ImportChatInviteRequest(match.group(2)))
-                except UserAlreadyParticipantError as e:
+                except UserAlreadyParticipantError:
                    logger.info(f"already joined {invite=}")
                except InviteRequestSentError:
                    logger.warning(f"already sent a join request with {invite} still no answer")

@@ -7,7 +7,8 @@ and identify important moments without watching the entire video.

"""

-import ffmpeg, os
+import ffmpeg
+import os
from loguru import logger

from auto_archiver.core import Enricher

@@ -1,6 +1,8 @@
import jsonlines
import mimetypes
-import os, shutil, subprocess
+import os
+import shutil
+import subprocess
from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator

@@ -186,7 +188,6 @@ class WaczExtractorEnricher(Enricher, Extractor):
        # get media out of .warc
        counter = 0
        seen_urls = set()
-        import json

        with open(warc_filename, "rb") as warc_stream:
            for record in ArchiveIterator(warc_stream):

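The removed line above is a function-local `import json`. Assuming the module already imports `json` at the top (not visible in this hunk), the local import is redundant: Python caches modules in `sys.modules`, so re-importing inside the function adds nothing. A minimal sketch:

```python
import json  # a single module-level import is enough

def summarize(record: dict) -> str:
    # no need to re-import json here; the module-level name is visible
    return json.dumps(record, sort_keys=True)

print(summarize({"url": "https://example.com", "status": 200}))
```
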
@@ -1,6 +1,7 @@
import json
from loguru import logger
-import time, requests
+import time
+import requests

from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import url as UrlUtil

@@ -57,7 +58,7 @@ class WaybackExtractorEnricher(Enricher, Extractor):
            if not job_id:
                logger.error(f"Wayback failed with {r.json()}")
                return False
-        except json.decoder.JSONDecodeError as e:
+        except json.decoder.JSONDecodeError:
            logger.error(f"Expected a JSON with job_id from Wayback and got {r.text}")
            return False

@@ -80,7 +81,7 @@ class WaybackExtractorEnricher(Enricher, Extractor):
            except requests.exceptions.RequestException as e:
                logger.warning(f"RequestException: fetching status for {url=} due to: {e}")
                break
-            except json.decoder.JSONDecodeError as e:
+            except json.decoder.JSONDecodeError:
                logger.error(f"Expected a JSON from Wayback and got {r.text} for {url=}")
                break
            except Exception as e:

@@ -1,5 +1,6 @@
import traceback
-import requests, time
+import requests
+import time
from loguru import logger

from auto_archiver.core import Enricher

@@ -16,7 +17,7 @@ class WhisperEnricher(Enricher):
    def setup(self) -> None:
        self.stores = self.config["steps"]["storages"]
        self.s3 = self.module_factory.get_module("s3_storage", self.config)
-        if not "s3_storage" in self.stores:
+        if "s3_storage" not in self.stores:
            logger.error(
                "WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called."
            )

@@ -66,15 +66,15 @@ class CookieSettingDriver(webdriver.Firefox):

        if self.facebook_accept_cookies:
            try:
-                logger.debug(f"Trying fb click accept cookie popup.")
+                logger.debug("Trying fb click accept cookie popup.")
                super(CookieSettingDriver, self).get("http://www.facebook.com")
                essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
                essential_only.click()
-                logger.debug(f"fb click worked")
+                logger.debug("fb click worked")
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except Exception as e:
-                logger.warning(f"Failed on fb accept cookies.", e)
+                logger.warning("Failed on fb accept cookies.", e)

        # now get the actual URL
        super(CookieSettingDriver, self).get(url)