kopia lustrzana https://github.com/bellingcat/auto-archiver
fixing imports
rodzic
ac000d5943
commit
d1e4dde3f6
|
@ -1 +0,0 @@
|
|||
# from .auto_archiver import *
|
|
@ -1,15 +1,3 @@
|
|||
# we need to explicitly expose the available imports here
|
||||
# from .base_archiver import Archiver, ArchiveResult
|
||||
# from .telegram_archiver import TelegramArchiver
|
||||
# from .telethon_archiver import TelethonArchiver
|
||||
# from .tiktok_archiver import TiktokArchiver
|
||||
# from .wayback_archiver import WaybackArchiver
|
||||
# from .youtubedl_archiver import YoutubeDLArchiver
|
||||
# from .twitter_archiver import TwitterArchiver
|
||||
# from .vk_archiver import VkArchiver
|
||||
# from .twitter_api_archiver import TwitterApiArchiver
|
||||
# from .instagram_archiver import InstagramArchiver
|
||||
|
||||
from .archiver import Archiver
|
||||
from .telethon_archiver import TelethonArchiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import re, os, shutil, html, traceback
|
||||
import re, os, shutil, traceback
|
||||
import instaloader # https://instaloader.github.io/as-module.html
|
||||
from loguru import logger
|
||||
|
||||
|
|
|
@ -1,12 +1,9 @@
|
|||
import requests, re
|
||||
|
||||
import html
|
||||
import requests, re, html
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class TelegramArchiver(Archiver):
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors import ChannelInvalidError
|
||||
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
|
||||
from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
|
||||
from loguru import logger
|
||||
|
@ -9,8 +8,7 @@ from tqdm import tqdm
|
|||
import re, time, json, os
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class TelethonArchiver(Archiver):
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
import json
|
||||
import os, traceback
|
||||
import re
|
||||
import uuid
|
||||
import json, os, traceback, uuid
|
||||
import tiktok_downloader
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class TiktokArchiver(Archiver):
|
||||
|
@ -19,7 +15,7 @@ class TiktokArchiver(Archiver):
|
|||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
|
||||
# TikTok posts are static
|
||||
return False
|
||||
|
@ -44,7 +40,6 @@ class TiktokArchiver(Archiver):
|
|||
error = traceback.format_exc()
|
||||
logger.warning(f'Other Tiktok error {error}')
|
||||
|
||||
|
||||
try:
|
||||
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||
tiktok_media = tiktok_downloader.snaptik(url).get_media()
|
||||
|
|
|
@ -1,16 +1,13 @@
|
|||
|
||||
import json
|
||||
import json, mimetypes
|
||||
from datetime import datetime
|
||||
import mimetypes
|
||||
import os
|
||||
from loguru import logger
|
||||
from pytwitter import Api
|
||||
from slugify import slugify
|
||||
|
||||
from . import Archiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata,Media
|
||||
|
||||
|
||||
class TwitterApiArchiver(TwitterArchiver, Archiver):
|
||||
|
|
|
@ -1,15 +1,11 @@
|
|||
import html, re, requests
|
||||
import mimetypes
|
||||
import json
|
||||
import os
|
||||
import re, requests, mimetypes, json
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
from slugify import slugify
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class TwitterArchiver(Archiver):
|
||||
|
|
|
@ -3,8 +3,7 @@ from vk_url_scraper import VkScraper
|
|||
|
||||
from ..utils.misc import dump_payload
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class VkArchiver(Archiver):
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
import datetime
|
||||
import os
|
||||
|
||||
import yt_dlp
|
||||
import datetime, os, yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
|
@ -22,6 +18,7 @@ class YoutubeDLArchiver(Archiver):
|
|||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
#TODO: yt-dlp for transcripts?
|
||||
url = item.get_url()
|
||||
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
import tempfile, json
|
||||
import auto_archive
|
||||
from loguru import logger
|
||||
from configs import Config
|
||||
from storages import Storage
|
||||
from slugify import slugify
|
||||
|
||||
|
||||
def main():
|
||||
c = Config()
|
||||
c.parse()
|
||||
url = c.url
|
||||
if not url:
|
||||
logger.error("Invalid URL: '{url}'")
|
||||
return
|
||||
logger.info(f'Archiving "{url=}".')
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmpdir:
|
||||
Storage.TMP_FOLDER = tmpdir
|
||||
result = auto_archive.archive_url(c, url, "", f"{url=}", False)
|
||||
c.destroy_webdriver()
|
||||
key = f"media_{slugify(url)}.json"
|
||||
with open(key, "w", encoding="utf-8") as outf:
|
||||
json.dump(result.media, outf, ensure_ascii=False, indent=4)
|
||||
c.get_storage().upload(key, key)
|
||||
print(result)
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -11,8 +11,8 @@ from ..feeders import Feeder
|
|||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from . import Step
|
||||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -6,13 +6,11 @@ from dataclasses import dataclass, field
|
|||
from dataclasses_json import dataclass_json
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from .media import Media
|
||||
|
||||
|
||||
# annotation order matters
|
||||
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class Metadata:
|
||||
|
@ -72,6 +70,7 @@ class Metadata:
|
|||
|
||||
# custom getter/setters
|
||||
|
||||
|
||||
def set_url(self, url: str) -> Metadata:
|
||||
assert type(url) is str and len(url) > 0, "invalid URL"
|
||||
return self.set("url", url)
|
||||
|
|
|
@ -15,49 +15,11 @@ import tempfile, traceback
|
|||
from loguru import logger
|
||||
|
||||
|
||||
"""
|
||||
how not to couple the different pieces of logic
|
||||
due to the use of constants for the metadata keys?
|
||||
perhaps having methods on the Metadata level that can be used to fetch a limited number of
|
||||
keys, never using strings but rather methods?
|
||||
eg: m = Metadata()
|
||||
m.get("screenshot") vs m.get_all()
|
||||
m.get_url()
|
||||
m.get_hash()
|
||||
m.get_main_file().get_title()
|
||||
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
|
||||
# maybe there is a way for Archivers and Enrichers and Storages to add their own methods
|
||||
# which still raises the question of how the database, e.g., knows they exist?
|
||||
# maybe there's a function to fetch them all, and each Database can register whatever they get
|
||||
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
|
||||
# and if it's there: great, otherwise business as usual.
|
||||
# and a MongoDatabase could register all data, for example.
|
||||
#
|
||||
How are Orchestrators created? from a configuration file?
|
||||
orchestrator = ArchivingOrchestrator(config)
|
||||
# Config contains 1 URL, or URLs, from the command line
|
||||
# OR a feeder which is described in the config file
|
||||
# config.get_feeder() # if called as docker run --url "http...." then it uses the default filter
|
||||
# if config.yaml says config
|
||||
orchestrator.start()
|
||||
|
||||
|
||||
Example applications:
|
||||
1. auto-archiver for GSheets
|
||||
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
|
||||
3. archiver backend for a UI that implements a REST API, the API calls CLI
|
||||
|
||||
Cisticola considerations:
|
||||
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
|
||||
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||
"""
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
def __init__(self, config) -> None:
|
||||
self.feeder: Feeder = config.feeder
|
||||
self.formatter: Formatter = config.formatter
|
||||
self.enrichers = config.enrichers
|
||||
self.enrichers: List[Enricher] = config.enrichers
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
|
@ -124,7 +86,7 @@ class ArchivingOrchestrator:
|
|||
# 3 - call archivers until one succeeds
|
||||
for a in self.archivers:
|
||||
logger.info(f"Trying archiver {a.name}")
|
||||
try:
|
||||
try:
|
||||
# Q: should this be refactored so it's just a.download(result)?
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
|
|
|
@ -2,7 +2,6 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass, field
|
||||
from inspect import ClassFoundException
|
||||
from typing import Type
|
||||
from ..core import Metadata
|
||||
from abc import ABC
|
||||
# from collections.abc import Iterable
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import os
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
|
||||
class CSVDb(Database):
|
||||
"""
|
||||
|
|
|
@ -2,8 +2,8 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from typing import Union
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
from ..core import Metadata, Step
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Union, Tuple
|
||||
import gspread, datetime
|
||||
import datetime
|
||||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
|
@ -8,7 +8,6 @@ from loguru import logger
|
|||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..utils import Gsheets
|
||||
from ..utils import GWorksheet
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
from ..core import Metadata, Step
|
||||
|
||||
@dataclass
|
||||
class Enricher(Step, ABC):
|
||||
|
|
|
@ -1,10 +1,7 @@
|
|||
import hashlib
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -4,8 +4,7 @@ from selenium.common.exceptions import TimeoutException
|
|||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
from ..core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
import uuid
|
||||
import ffmpeg, os, uuid
|
||||
from loguru import logger
|
||||
import ffmpeg, os
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
from ..core import Media, Metadata
|
||||
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
"""
|
||||
|
|
|
@ -1,12 +1,7 @@
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import uuid
|
||||
import os, shutil, subprocess, uuid
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from ..core import Media
|
||||
from ..core import Metadata
|
||||
from ..core import Media, Metadata
|
||||
from . import Enricher
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
import gspread, os
|
||||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
import gspread, os
|
||||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..utils import Gsheets
|
||||
from ..utils import GWorksheet
|
||||
from ..utils import Gsheets, GWorksheet
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
name = "gsheet_feeder"
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import mimetypes
|
||||
import mimetypes, uuid, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
import uuid, os, pathlib
|
||||
from urllib.parse import quote
|
||||
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media
|
||||
from . import Formatter
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..core import Metadata, Media
|
||||
from . import Formatter
|
||||
|
||||
|
@ -12,4 +13,4 @@ class MuteFormatter(Formatter):
|
|||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import json, gspread
|
||||
|
||||
from loguru import logger
|
||||
from ..core import Step
|
||||
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue