pull/72/head
msramalho 2023-01-27 00:19:58 +00:00
rodzic ac000d5943
commit d1e4dde3f6
28 zmienionych plików z 38 dodań i 161 usunięć

Wyświetl plik

@ -1 +0,0 @@
# from .auto_archiver import *

Wyświetl plik

@ -1,15 +1,3 @@
# we need to explicitly expose the available imports here
# from .base_archiver import Archiver, ArchiveResult
# from .telegram_archiver import TelegramArchiver
# from .telethon_archiver import TelethonArchiver
# from .tiktok_archiver import TiktokArchiver
# from .wayback_archiver import WaybackArchiver
# from .youtubedl_archiver import YoutubeDLArchiver
# from .twitter_archiver import TwitterArchiver
# from .vk_archiver import VkArchiver
# from .twitter_api_archiver import TwitterApiArchiver
# from .instagram_archiver import InstagramArchiver
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver

Wyświetl plik

@ -1,4 +1,4 @@
import re, os, shutil, html, traceback
import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger

Wyświetl plik

@ -1,12 +1,9 @@
import requests, re
import html
import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class TelegramArchiver(Archiver):

Wyświetl plik

@ -1,7 +1,6 @@
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
@ -9,8 +8,7 @@ from tqdm import tqdm
import re, time, json, os
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class TelethonArchiver(Archiver):

Wyświetl plik

@ -1,13 +1,9 @@
import json
import os, traceback
import re
import uuid
import json, os, traceback, uuid
import tiktok_downloader
from loguru import logger
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class TiktokArchiver(Archiver):
@ -19,7 +15,7 @@ class TiktokArchiver(Archiver):
@staticmethod
def configs() -> dict:
    """Return this archiver's configurable options; TikTok exposes none."""
return {}
def is_rearchivable(self, url: str) -> bool:
    """Always False: a TikTok post's content does not change after publication,
    so there is never new content to capture on a second pass."""
# TikTok posts are static
return False
@ -44,7 +40,6 @@ class TiktokArchiver(Archiver):
error = traceback.format_exc()
logger.warning(f'Other Tiktok error {error}')
try:
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media()

Wyświetl plik

@ -1,16 +1,13 @@
import json
import json, mimetypes
from datetime import datetime
import mimetypes
import os
from loguru import logger
from pytwitter import Api
from slugify import slugify
from . import Archiver
from .twitter_archiver import TwitterArchiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata,Media
class TwitterApiArchiver(TwitterArchiver, Archiver):

Wyświetl plik

@ -1,15 +1,11 @@
import html, re, requests
import mimetypes
import json
import os
import re, requests, mimetypes, json
from datetime import datetime
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from slugify import slugify
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class TwitterArchiver(Archiver):

Wyświetl plik

@ -3,8 +3,7 @@ from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class VkArchiver(Archiver):

Wyświetl plik

@ -1,12 +1,8 @@
import datetime
import os
import yt_dlp
import datetime, os, yt_dlp
from loguru import logger
from . import Archiver
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
class YoutubeDLArchiver(Archiver):
@ -22,6 +18,7 @@ class YoutubeDLArchiver(Archiver):
}
def download(self, item: Metadata) -> Metadata:
#TODO: yt-dlp for transcripts?
url = item.get_url()
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:

Wyświetl plik

@ -1,30 +0,0 @@
import tempfile, json
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
from slugify import slugify
def main():
    """Archive a single URL taken from the CLI/config.

    Parses the configuration, archives the URL into a temporary working
    directory, dumps the resulting media metadata to a JSON file, uploads
    that file to the configured storage, and returns the archive result
    (None when no URL was provided).
    """
    c = Config()
    c.parse()
    url = c.url
    if not url:
        # BUG FIX: original used a plain string "Invalid URL: '{url}'" which
        # logged the literal placeholder; it must be an f-string.
        logger.error(f"Invalid URL: '{url}'")
        return
    logger.info(f'Archiving "{url=}".')
    # work inside a throwaway directory so downloaded media is cleaned up
    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
        Storage.TMP_FOLDER = tmpdir
        result = auto_archive.archive_url(c, url, "", f"{url=}", False)
        c.destroy_webdriver()  # release the selenium driver before uploading
        key = f"media_{slugify(url)}.json"
        with open(key, "w", encoding="utf-8") as outf:
            json.dump(result.media, outf, ensure_ascii=False, indent=4)
        c.get_storage().upload(key, key)
    print(result)
    return result


if __name__ == "__main__":
    main()

Wyświetl plik

@ -11,8 +11,8 @@ from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import Storage
from . import Step
from ..enrichers import Enricher
from . import Step
@dataclass

Wyświetl plik

@ -6,13 +6,11 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import datetime
from urllib.parse import urlparse
from loguru import logger
from dateutil.parser import parse as parse_dt
from .media import Media
# annotation order matters
@dataclass_json
@dataclass
class Metadata:
@ -72,6 +70,7 @@ class Metadata:
# custom getter/setters
def set_url(self, url: str) -> Metadata:
    """Validate and store this item's URL, returning self for chaining.

    Raises:
        ValueError: if url is not a non-empty string. (Replaces the original
        `assert`, which is silently stripped when Python runs with -O.)
    """
    if not isinstance(url, str) or len(url) == 0:
        raise ValueError("invalid URL")
    return self.set("url", url)

Wyświetl plik

@ -15,49 +15,11 @@ import tempfile, traceback
from loguru import logger
"""
how not to couple the different pieces of logic
due to the use of constants for the metadata keys?
perhaps having methods on the Metadata level that can be used to fetch a limited number of
keys, never using strings but rather methods?
eg: m = Metadata()
m.get("screenshot") vs m.get_all()
m.get_url()
m.get_hash()
m.get_main_file().get_title()
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
# maybe there is a way for Archivers and Enrichers and Storages to add their own methods
# which raises still the Q of how the database, eg., knows they exist?
# maybe there's a function to fetch them all, and each Database can register whatever they get
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
# and if it's there: great, otherwise business as usual.
# and a MongoDatabase could register all data, for example.
#
How are Orchestrators created? from a configuration file?
orchestrator = ArchivingOrchestrator(config)
# Config contains 1 URL, or URLs, from the command line
# OR a feeder which is described in the config file
# config.get_feeder() # if called as docker run --url "http...." then the uses the default filter
# if config.yaml says config
orchestrator.start()
Example applications:
1. auto-archiver for GSheets
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
3. archiver backend for a UI that implements a REST API, the API calls CLI
Cisticola considerations:
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
class ArchivingOrchestrator:
def __init__(self, config) -> None:
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
self.enrichers: List[Enricher] = config.enrichers
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages
@ -124,7 +86,7 @@ class ArchivingOrchestrator:
# 3 - call archivers until one succeeds
for a in self.archivers:
logger.info(f"Trying archiver {a.name}")
try:
try:
# Q: should this be refactored so it's just a.download(result)?
result.merge(a.download(result))
if result.is_success(): break

Wyświetl plik

@ -2,7 +2,6 @@ from __future__ import annotations
from dataclasses import dataclass, field
from inspect import ClassFoundException
from typing import Type
from ..core import Metadata
from abc import ABC
# from collections.abc import Iterable

Wyświetl plik

@ -1,10 +1,11 @@
import os
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
from . import Database
from ..core import Metadata
from csv import DictWriter
from dataclasses import asdict
class CSVDb(Database):
"""

Wyświetl plik

@ -2,8 +2,8 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from typing import Union
from ..core import Metadata
from ..core import Step
from ..core import Metadata, Step
@dataclass

Wyświetl plik

@ -1,5 +1,5 @@
from typing import Union, Tuple
import gspread, datetime
import datetime
# from metadata import Metadata
from loguru import logger
@ -8,7 +8,6 @@ from loguru import logger
from . import Database
from ..core import Metadata
from ..core import Media
from ..utils import Gsheets
from ..utils import GWorksheet

Wyświetl plik

@ -1,8 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from ..core import Metadata
from ..core import Step
from ..core import Metadata, Step
@dataclass
class Enricher(Step, ABC):

Wyświetl plik

@ -1,10 +1,7 @@
import hashlib
from loguru import logger
from selenium.common.exceptions import TimeoutException
import time, requests
from . import Enricher
from ..utils import Webdriver
from ..core import Metadata

Wyświetl plik

@ -4,8 +4,7 @@ from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver
from ..core import Media
from ..core import Metadata
from ..core import Media, Metadata
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"

Wyświetl plik

@ -1,10 +1,9 @@
import uuid
import ffmpeg, os, uuid
from loguru import logger
import ffmpeg, os
from . import Enricher
from ..core import Media
from ..core import Metadata
from ..core import Media, Metadata
class ThumbnailEnricher(Enricher):
"""

Wyświetl plik

@ -1,12 +1,7 @@
import os
import shutil
import subprocess
import uuid
import os, shutil, subprocess, uuid
from loguru import logger
import time, requests
from ..core import Media
from ..core import Metadata
from ..core import Media, Metadata
from . import Enricher

Wyświetl plik

@ -1,9 +1,5 @@
import gspread, os
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from . import Feeder
from ..core import Metadata

Wyświetl plik

@ -1,14 +1,12 @@
import gspread, os
# from metadata import Metadata
from loguru import logger
from slugify import slugify
# from . import Enricher
from . import Feeder
from ..core import Metadata
from ..utils import Gsheets
from ..utils import GWorksheet
from ..utils import Gsheets, GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"

Wyświetl plik

@ -1,12 +1,10 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes
import mimetypes, uuid, os, pathlib
from jinja2 import Environment, FileSystemLoader
import uuid, os, pathlib
from urllib.parse import quote
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media
from . import Formatter

Wyświetl plik

@ -1,5 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from ..core import Metadata, Media
from . import Formatter
@ -12,4 +13,4 @@ class MuteFormatter(Formatter):
# without this STEP.__init__ is not called
super().__init__(config)
def format(self, item: Metadata) -> Media: return None
def format(self, item: Metadata) -> Media: return None

Wyświetl plik

@ -1,6 +1,5 @@
import json, gspread
from loguru import logger
from ..core import Step