From d1e4dde3f60c92aa796d239dfb9d0cdc55d3d0b3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 27 Jan 2023 00:19:58 +0000 Subject: [PATCH] fixing imports --- src/__init__.py | 1 - src/auto_archiver/archivers/__init__.py | 12 ------ .../archivers/instagram_archiver.py | 2 +- .../archivers/telegram_archiver.py | 7 +--- .../archivers/telethon_archiver.py | 4 +- .../archivers/tiktok_archiver.py | 11 ++--- .../archivers/twitter_api_archiver.py | 7 +--- .../archivers/twitter_archiver.py | 8 +--- src/auto_archiver/archivers/vk_archiver.py | 3 +- .../archivers/youtubedl_archiver.py | 9 ++-- src/auto_archiver/cli.py | 30 ------------- src/auto_archiver/core/config.py | 2 +- src/auto_archiver/core/metadata.py | 5 +-- src/auto_archiver/core/orchestrator.py | 42 +------------------ src/auto_archiver/core/step.py | 1 - src/auto_archiver/databases/csv_db.py | 5 ++- src/auto_archiver/databases/database.py | 4 +- src/auto_archiver/databases/gsheet_db.py | 3 +- src/auto_archiver/enrichers/enricher.py | 3 +- src/auto_archiver/enrichers/hash_enricher.py | 3 -- .../enrichers/screenshot_enricher.py | 3 +- .../enrichers/thumbnail_enricher.py | 7 ++-- src/auto_archiver/enrichers/wacz_enricher.py | 9 +--- src/auto_archiver/feeders/cli_feeder.py | 4 -- src/auto_archiver/feeders/gsheet_feeder.py | 4 +- .../formatters/html_formatter.py | 6 +-- .../formatters/mute_formatter.py | 3 +- src/auto_archiver/utils/gsheet.py | 1 - 28 files changed, 38 insertions(+), 161 deletions(-) delete mode 100644 src/auto_archiver/cli.py diff --git a/src/__init__.py b/src/__init__.py index c06311a..e69de29 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +0,0 @@ -# from .auto_archiver import * \ No newline at end of file diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index b5c4faa..f6b5b05 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -1,15 +1,3 @@ -# we need to explicitly expose the available imports here -# from .base_archiver import Archiver, ArchiveResult -# from .telegram_archiver import TelegramArchiver -# from .telethon_archiver import TelethonArchiver -# from .tiktok_archiver import TiktokArchiver -# from .wayback_archiver import WaybackArchiver -# from .youtubedl_archiver import YoutubeDLArchiver -# from .twitter_archiver import TwitterArchiver -# from .vk_archiver import VkArchiver -# from .twitter_api_archiver import TwitterApiArchiver -# from .instagram_archiver import InstagramArchiver - from .archiver import Archiver from .telethon_archiver import TelethonArchiver from .twitter_archiver import TwitterArchiver diff --git a/src/auto_archiver/archivers/instagram_archiver.py b/src/auto_archiver/archivers/instagram_archiver.py index c0d1b2c..97dd172 100644 --- a/src/auto_archiver/archivers/instagram_archiver.py +++ b/src/auto_archiver/archivers/instagram_archiver.py @@ -1,4 +1,4 @@ -import re, os, shutil, html, traceback +import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/archivers/telegram_archiver.py index 6df421b..6ddcf0e 100644 --- a/src/auto_archiver/archivers/telegram_archiver.py +++ b/src/auto_archiver/archivers/telegram_archiver.py @@ -1,12 +1,9 @@ -import requests, re - -import html +import requests, re, html from bs4 import BeautifulSoup from loguru import logger from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class TelegramArchiver(Archiver): diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/archivers/telethon_archiver.py index e0e2bed..99af97f 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/archivers/telethon_archiver.py @@ -1,7 +1,6 @@ from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError -from telethon.tl.types import PeerUser, PeerChat, PeerChannel from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError from loguru import logger @@ -9,8 +8,7 @@ from tqdm import tqdm import re, time, json, os from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class TelethonArchiver(Archiver): diff --git a/src/auto_archiver/archivers/tiktok_archiver.py b/src/auto_archiver/archivers/tiktok_archiver.py index 532df25..601ef51 100644 --- a/src/auto_archiver/archivers/tiktok_archiver.py +++ b/src/auto_archiver/archivers/tiktok_archiver.py @@ -1,13 +1,9 @@ -import json -import os, traceback -import re -import uuid +import json, os, traceback, uuid import tiktok_downloader from loguru import logger from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class TiktokArchiver(Archiver): @@ -19,7 +15,7 @@ class TiktokArchiver(Archiver): @staticmethod def configs() -> dict: return {} - + def is_rearchivable(self, url: str) -> bool: # TikTok posts are static return False @@ -44,7 +40,6 @@ class TiktokArchiver(Archiver): error = traceback.format_exc() logger.warning(f'Other Tiktok error {error}') - try: filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') tiktok_media = tiktok_downloader.snaptik(url).get_media() diff --git a/src/auto_archiver/archivers/twitter_api_archiver.py b/src/auto_archiver/archivers/twitter_api_archiver.py index 821d8d4..cd1eb1c 100644 --- a/src/auto_archiver/archivers/twitter_api_archiver.py +++ b/src/auto_archiver/archivers/twitter_api_archiver.py @@ -1,16 +1,13 @@ -import json +import json, mimetypes from datetime import datetime -import mimetypes -import os from loguru import logger from pytwitter import Api from slugify import slugify from . import Archiver from .twitter_archiver import TwitterArchiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata,Media class TwitterApiArchiver(TwitterArchiver, Archiver): diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py index a811abf..1128a57 100644 --- a/src/auto_archiver/archivers/twitter_archiver.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -1,15 +1,11 @@ -import html, re, requests -import mimetypes -import json -import os +import re, requests, mimetypes, json from datetime import datetime from loguru import logger from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from slugify import slugify from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class TwitterArchiver(Archiver): diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/archivers/vk_archiver.py index b1febe0..bbc3456 100644 --- a/src/auto_archiver/archivers/vk_archiver.py +++ b/src/auto_archiver/archivers/vk_archiver.py @@ -3,8 +3,7 @@ from vk_url_scraper import VkScraper from ..utils.misc import dump_payload from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class VkArchiver(Archiver): diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py index 443fb17..136fecf 100644 --- a/src/auto_archiver/archivers/youtubedl_archiver.py +++ b/src/auto_archiver/archivers/youtubedl_archiver.py @@ -1,12 +1,8 @@ -import datetime -import os - -import yt_dlp +import datetime, os, yt_dlp from loguru import logger from . import Archiver -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media class YoutubeDLArchiver(Archiver): @@ -22,6 +18,7 @@ class YoutubeDLArchiver(Archiver): } def download(self, item: Metadata) -> Metadata: + #TODO: yt-dlp for transcripts? url = item.get_url() if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: diff --git a/src/auto_archiver/cli.py b/src/auto_archiver/cli.py deleted file mode 100644 index b6d2b70..0000000 --- a/src/auto_archiver/cli.py +++ /dev/null @@ -1,30 +0,0 @@ -import tempfile, json -import auto_archive -from loguru import logger -from configs import Config -from storages import Storage -from slugify import slugify - - -def main(): - c = Config() - c.parse() - url = c.url - if not url: - logger.error("Invalid URL: '{url}'") - return - logger.info(f'Archiving "{url=}".') - with tempfile.TemporaryDirectory(dir="./") as tmpdir: - Storage.TMP_FOLDER = tmpdir - result = auto_archive.archive_url(c, url, "", f"{url=}", False) - c.destroy_webdriver() - key = f"media_{slugify(url)}.json" - with open(key, "w", encoding="utf-8") as outf: - json.dump(result.media, outf, ensure_ascii=False, indent=4) - c.get_storage().upload(key, key) - print(result) - return result - - -if __name__ == "__main__": - main() diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 5aa725d..ece9102 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,8 +11,8 @@ from ..feeders import Feeder from ..databases import Database from ..formatters import Formatter from ..storages import Storage -from . import Step from ..enrichers import Enricher +from . import Step @dataclass diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index d666843..7a741f7 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -6,13 +6,11 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json import datetime from urllib.parse import urlparse -from loguru import logger from dateutil.parser import parse as parse_dt from .media import Media + # annotation order matters - - @dataclass_json @dataclass class Metadata: @@ -72,6 +70,7 @@ class Metadata: # custom getter/setters + def set_url(self, url: str) -> Metadata: assert type(url) is str and len(url) > 0, "invalid URL" return self.set("url", url) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index e7940a4..3e14498 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -15,49 +15,11 @@ import tempfile, traceback from loguru import logger -""" -how not to couple the different pieces of logic -due to the use of constants for the metadata keys? -perhaps having methods on the Metadata level that can be used to fetch a limited number of -keys, never using strings but rather methods? -eg: m = Metadata() - m.get("screenshot") vs m.get_all() - m.get_url() - m.get_hash() - m.get_main_file().get_title() - m.get_screenshot() # this method should only exist because of the Screenshot Enricher - # maybe there is a way for Archivers and Enrichers and Storages to add their own methdods - # which raises still the Q of how the database, eg., knows they exist? - # maybe there's a function to fetch them all, and each Database can register wathever they get - # for eg the GoogleSheets will only register based on the available column names, it knows what it wants - # and if it's there: great, otherwise business as usual. - # and a MongoDatabase could register all data, for example. - # -How are Orchestrators created? from a configuration file? - orchestrator = ArchivingOrchestrator(config) - # Config contains 1 URL, or URLs, from the command line - # OR a feeder which is described in the config file - # config.get_feeder() # if called as docker run --url "http...." then the uses the default filter - # if config.yaml says config - orchestrator.start() - - -Example applications: -1. auto-archiver for GSheets -2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2" -3. archiver backend for a UI that implements a REST API, the API calls CLI - -Cisticola considerations: -1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass) -2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping -""" - - class ArchivingOrchestrator: def __init__(self, config) -> None: self.feeder: Feeder = config.feeder self.formatter: Formatter = config.formatter - self.enrichers = config.enrichers + self.enrichers: List[Enricher] = config.enrichers self.archivers: List[Archiver] = config.archivers self.databases: List[Database] = config.databases self.storages: List[Storage] = config.storages @@ -124,7 +86,7 @@ class ArchivingOrchestrator: # 3 - call archivers until one succeeds for a in self.archivers: logger.info(f"Trying archiver {a.name}") - try: + try: # Q: should this be refactored so it's just a.download(result)? result.merge(a.download(result)) if result.is_success(): break diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py index 8cfdecd..6ac6648 100644 --- a/src/auto_archiver/core/step.py +++ b/src/auto_archiver/core/step.py @@ -2,7 +2,6 @@ from __future__ import annotations from dataclasses import dataclass, field from inspect import ClassFoundException from typing import Type -from ..core import Metadata from abc import ABC # from collections.abc import Iterable diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/databases/csv_db.py index 72c804b..0743047 100644 --- a/src/auto_archiver/databases/csv_db.py +++ b/src/auto_archiver/databases/csv_db.py @@ -1,10 +1,11 @@ import os from loguru import logger +from csv import DictWriter +from dataclasses import asdict from . import Database from ..core import Metadata -from csv import DictWriter -from dataclasses import asdict + class CSVDb(Database): """ diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/databases/database.py index 133afaa..01b7869 100644 --- a/src/auto_archiver/databases/database.py +++ b/src/auto_archiver/databases/database.py @@ -2,8 +2,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from ..core import Metadata -from ..core import Step + +from ..core import Metadata, Step @dataclass diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index de1548e..8721725 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -1,5 +1,5 @@ from typing import Union, Tuple -import gspread, datetime +import datetime # from metadata import Metadata from loguru import logger @@ -8,7 +8,6 @@ from loguru import logger from . import Database from ..core import Metadata from ..core import Media -from ..utils import Gsheets from ..utils import GWorksheet diff --git a/src/auto_archiver/enrichers/enricher.py b/src/auto_archiver/enrichers/enricher.py index f67d9fe..4948d57 100644 --- a/src/auto_archiver/enrichers/enricher.py +++ b/src/auto_archiver/enrichers/enricher.py @@ -1,8 +1,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC -from ..core import Metadata -from ..core import Step +from ..core import Metadata, Step @dataclass class Enricher(Step, ABC): diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/enrichers/hash_enricher.py index 7970b17..35d9ebb 100644 --- a/src/auto_archiver/enrichers/hash_enricher.py +++ b/src/auto_archiver/enrichers/hash_enricher.py @@ -1,10 +1,7 @@ import hashlib from loguru import logger -from selenium.common.exceptions import TimeoutException -import time, requests from . import Enricher -from ..utils import Webdriver from ..core import Metadata diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 0ae2e29..a953d16 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -4,8 +4,7 @@ from selenium.common.exceptions import TimeoutException from . import Enricher from ..utils import Webdriver -from ..core import Media -from ..core import Metadata +from ..core import Media, Metadata class ScreenshotEnricher(Enricher): name = "screenshot_enricher" diff --git a/src/auto_archiver/enrichers/thumbnail_enricher.py b/src/auto_archiver/enrichers/thumbnail_enricher.py index f1a3149..2958213 100644 --- a/src/auto_archiver/enrichers/thumbnail_enricher.py +++ b/src/auto_archiver/enrichers/thumbnail_enricher.py @@ -1,10 +1,9 @@ -import uuid +import ffmpeg, os, uuid from loguru import logger -import ffmpeg, os from . import Enricher -from ..core import Media -from ..core import Metadata +from ..core import Media, Metadata + class ThumbnailEnricher(Enricher): """ diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index 504b6ec..49bc6a8 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -1,12 +1,7 @@ -import os -import shutil -import subprocess -import uuid +import os, shutil, subprocess, uuid from loguru import logger -import time, requests -from ..core import Media -from ..core import Metadata +from ..core import Media, Metadata from . import Enricher diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py index abc7602..8de3601 100644 --- a/src/auto_archiver/feeders/cli_feeder.py +++ b/src/auto_archiver/feeders/cli_feeder.py @@ -1,9 +1,5 @@ -import gspread, os - -# from metadata import Metadata from loguru import logger -# from . import Enricher from . import Feeder from ..core import Metadata diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py index c73015f..d5d3fcf 100644 --- a/src/auto_archiver/feeders/gsheet_feeder.py +++ b/src/auto_archiver/feeders/gsheet_feeder.py @@ -1,14 +1,12 @@ import gspread, os -# from metadata import Metadata from loguru import logger from slugify import slugify # from . import Enricher from . import Feeder from ..core import Metadata -from ..utils import Gsheets -from ..utils import GWorksheet +from ..utils import Gsheets, GWorksheet class GsheetsFeeder(Gsheets, Feeder): name = "gsheet_feeder" diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py index 72db4c0..317a5d0 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/formatters/html_formatter.py @@ -1,12 +1,10 @@ from __future__ import annotations from dataclasses import dataclass -import mimetypes +import mimetypes, uuid, os, pathlib from jinja2 import Environment, FileSystemLoader -import uuid, os, pathlib from urllib.parse import quote -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media from . import Formatter diff --git a/src/auto_archiver/formatters/mute_formatter.py b/src/auto_archiver/formatters/mute_formatter.py index 81b89b5..19830b1 100644 --- a/src/auto_archiver/formatters/mute_formatter.py +++ b/src/auto_archiver/formatters/mute_formatter.py @@ -1,5 +1,6 @@ from __future__ import annotations from dataclasses import dataclass + from ..core import Metadata, Media from . import Formatter @@ -12,4 +13,4 @@ class MuteFormatter(Formatter): # without this STEP.__init__ is not called super().__init__(config) - def format(self, item: Metadata) -> Media: return None \ No newline at end of file + def format(self, item: Metadata) -> Media: return None diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 98259f8..f793297 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -1,6 +1,5 @@ import json, gspread -from loguru import logger from ..core import Step