creating global context and refactoring tmp_dir logic

pull/74/head
msramalho 2023-03-23 11:17:38 +00:00
rodzic 39818e648a
commit 906ed0f6e0
16 zmienionych plików z 88 dodań i 34 usunięć

Wyświetl plik

@ -3,8 +3,8 @@ from abc import abstractmethod
from dataclasses import dataclass
import os
import mimetypes, requests
from ..core import Metadata
from ..core import Step
from ..core import Metadata, Step, ArchivingContext
@dataclass
@ -51,7 +51,7 @@ class Archiver(Step):
if len(to_filename) > 64:
to_filename = to_filename[-64:]
if item:
to_filename = os.path.join(item.get_tmp_dir(), to_filename)
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}

Wyświetl plik

@ -4,7 +4,7 @@ from loguru import logger
import time, os
from sqlite3 import OperationalError
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class InstagramTbotArchiver(Archiver):
@ -44,7 +44,7 @@ class InstagramTbotArchiver(Archiver):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = item.get_tmp_dir()
tmp_dir = ArchivingContext.get_tmp_dir()
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id

Wyświetl plik

@ -8,7 +8,7 @@ from tqdm import tqdm
import re, time, json, os
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class TelethonArchiver(Archiver):
@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = item.get_tmp_dir()
tmp_dir = ArchivingContext.get_tmp_dir()
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message

Wyświetl plik

@ -3,7 +3,7 @@ import tiktok_downloader
from loguru import logger
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class TiktokArchiver(Archiver):
@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}')
try:
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0:

Wyświetl plik

@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver):
@ -50,7 +50,7 @@ class VkArchiver(Archiver):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
for filename in filenames:
result.add_media(Media(filename))

Wyświetl plik

@ -2,7 +2,7 @@ import datetime, os, yt_dlp
from loguru import logger
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class YoutubeDLArchiver(Archiver):
@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
try:
# don't download since it can be a live stream

Wyświetl plik

@ -1,6 +1,7 @@
from .media import Media
from .metadata import Metadata
from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

Wyświetl plik

@ -0,0 +1,38 @@
class ArchivingContext:
    """
    Singleton context class for sharing values across archiving steps.

    Typical usage:
        ArchivingContext.set(key, value)
        ArchivingContext.get(key, default)
    The underlying singleton can be fetched with
    ArchivingContext.get_instance() if direct access is ever needed.
    """
    # lazily-created singleton instance
    _instance = None

    def __init__(self):
        # backing store for all shared context values
        self.configs: dict = {}

    @staticmethod
    def get_instance() -> "ArchivingContext":
        """Return the singleton instance, creating it on first access."""
        if ArchivingContext._instance is None:
            ArchivingContext._instance = ArchivingContext()
        return ArchivingContext._instance

    @staticmethod
    def set(key: str, value) -> None:
        """Store ``value`` under ``key`` in the shared context."""
        ArchivingContext.get_instance().configs[key] = value

    @staticmethod
    def get(key: str, default=None):
        """Return the value stored under ``key``, or ``default`` if absent."""
        return ArchivingContext.get_instance().configs.get(key, default)

    # ---- custom getters/setters for widely used context values

    @staticmethod
    def set_tmp_dir(tmp_dir: str) -> None:
        """Store the temporary working directory path."""
        # reuse the generic setter instead of duplicating dict access
        ArchivingContext.set("tmp_dir", tmp_dir)

    @staticmethod
    def get_tmp_dir() -> str:
        """Return the temporary working directory path (None if unset)."""
        return ArchivingContext.get("tmp_dir")

Wyświetl plik

@ -6,8 +6,9 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import mimetypes
# annotation order matters
@dataclass_json
@dataclass_json # annotation order matters
@dataclass
class Media:
filename: str
@ -40,3 +41,13 @@ class Media:
def is_video(self) -> bool:
return self.mimetype.startswith("video")
def is_audio(self) -> bool:
return self.mimetype.startswith("audio")
def store(self):
"""
either stores this media entry and all its media descendants
or returns if that process is already completed
"""
pass

Wyświetl plik

@ -3,13 +3,12 @@ from __future__ import annotations
from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
from dataclasses_json import dataclass_json, config
import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
from .media import Media
# annotation order matters
@dataclass_json
@dataclass
@ -17,10 +16,14 @@ class Metadata:
status: str = "no archiver"
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
rearchivable: bool = True # defaults to true, archivers can overwrite
# properties below are excluded from JSON representation
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
# tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
@ -93,12 +96,6 @@ class Metadata:
def get_title(self) -> str:
return self.get("title")
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
return self.set("tmp_dir", tmp_dir, True)
def get_tmp_dir(self) -> str:
return self.get("tmp_dir")
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
if type(timestamp) == str:
timestamp = parse_dt(timestamp)
@ -144,3 +141,7 @@ class Metadata:
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
**{"processed_at": self._processed_at}
)
def __str__(self) -> str:
return self.__repr__()

Wyświetl plik

@ -2,6 +2,8 @@ from __future__ import annotations
from ast import List
from typing import Union
from .context import ArchivingContext
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
@ -23,6 +25,7 @@ class ArchivingOrchestrator:
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages
ArchivingContext.set("storages", self.storages)
for a in self.archivers: a.setup()
@ -33,7 +36,7 @@ class ArchivingOrchestrator:
def feed_item(self, item: Metadata) -> Metadata:
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit

Wyświetl plik

@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver, UrlUtil
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"
@ -29,7 +29,7 @@ class ScreenshotEnricher(Enricher):
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException:

Wyświetl plik

@ -2,7 +2,7 @@ import ffmpeg, os, uuid
from loguru import logger
from . import Enricher
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
class ThumbnailEnricher(Enricher):
@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration")

Wyświetl plik

@ -1,7 +1,7 @@
import os, shutil, subprocess, uuid
from loguru import logger
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
from . import Enricher
from ..utils import UrlUtil
@ -34,7 +34,7 @@ class WaczEnricher(Enricher):
logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running

Wyświetl plik

@ -6,7 +6,7 @@ from urllib.parse import quote
from loguru import logger
from ..version import __version__
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
@ -43,7 +43,7 @@ class HtmlFormatter(Formatter):
metadata=item.get_clean_metadata(),
version=__version__
)
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
return Media(filename=html_path)

Wyświetl plik

@ -1,6 +1,6 @@
_MAJOR = "0"
_MINOR = "4"
_MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "5"