More manifests, base modules and rename from archiver to extractor.

pull/183/head
erinhmclark 2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions

View file

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
FROM webrecorder/browsertrix-crawler:1.4.2 AS base
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \
@ -22,28 +22,30 @@ RUN add-apt-repository ppa:mozillateam/ppa && \
# Poetry and runtime
FROM base AS runtime
FROM base AS poetry-env
ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1
RUN pip install --upgrade pip && \
pip install "poetry>=2.0.0,<3.0.0"
# Create a virtual environment for poetry and install it
RUN python3 -m venv /poetry-venv && \
/poetry-venv/bin/python -m pip install --upgrade pip && \
/poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0"
WORKDIR /app
COPY pyproject.toml poetry.lock README.md ./
# Copy dependency files and install dependencies (excluding the package itself)
RUN poetry install --only main --no-root --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache
# Copy code: This is needed for poetry to install the package itself,
# but the environment should be cached from the previous step if toml and lock files haven't changed
COPY ./src/ .
RUN poetry install --only main --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-cache
# Update PATH to include virtual environment binaries
@ -55,4 +57,3 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"]
# should be executed with 2 volumes (3 if local_storage is used)
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml

View file

@ -1,8 +0,0 @@
"""
Archivers are responsible for retrieving the content from various external platforms.
They act as specialized modules, each tailored to interact with a specific platform,
service, or data source. The archivers collectively enable the tool to comprehensively
collect and preserve a variety of content types, such as posts, images, videos and metadata.
"""
from .archiver import Archiver

View file

@ -0,0 +1,6 @@
from .database import Database
from .enricher import Enricher
from .feeder import Feeder
from .storage import Storage
from .extractor import Extractor
from .formatter import Formatter

View file

@ -3,13 +3,13 @@ from dataclasses import dataclass
from abc import abstractmethod, ABC
from typing import Union
from ..core import Metadata, Step
from auto_archiver.core import Metadata, Step
@dataclass
class Database(Step, ABC):
name = "database"
name = "database"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)

View file

@ -0,0 +1,31 @@
"""
Enrichers are modular components that enhance archived content by adding
context, metadata, or additional processing.
These add additional information to the context, such as screenshots, hashes, and metadata.
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from auto_archiver.core import Metadata, Step
@dataclass
class Enricher(Step, ABC):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass
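For illustration, a minimal sketch of a concrete enricher built on this base class; the class name, the metadata key, and the Metadata.set/get_url helpers it leans on are assumptions for the example, not part of this commit:
from dataclasses import dataclass
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

@dataclass
class UrlLengthEnricher(Enricher):  # hypothetical enricher, for illustration only
    name = "url_length_enricher"

    def __init__(self, config: dict) -> None:
        super().__init__(config)  # required so Step.__init__ runs

    def enrich(self, to_enrich: Metadata) -> None:
        # attach a trivial piece of extra context to the Metadata object
        to_enrich.set("url_length", len(to_enrich.get_url()))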

View file

@ -1,7 +1,7 @@
""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
This class provides common utility methods and a standard interface for archivers.
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
This class provides common utility methods and a standard interface for extractors.
Factory method to initialize an archiver instance based on its name.
Factory method to initialize an extractor instance based on its name.
"""
@ -15,32 +15,32 @@ import mimetypes, requests
from loguru import logger
from retrying import retry
from ..core import Metadata, Step, ArchivingContext
from ..core import Metadata, ArchivingContext
@dataclass
class Archiver:
class Extractor:
"""
Base class for implementing archivers in the media archiving framework.
Base class for implementing extractors in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
# used when extractors need to login or do other one-time setup
pass
def cleanup(self) -> None:
# called when archivers are done, or upon errors, cleanup any resources
# called when extractors are done, or upon errors, cleanup any resources
pass
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url
def suitable(self, url: str) -> bool:
"""
Returns True if this archiver can handle the given URL
Returns True if this extractor can handle the given URL
Should be overridden by subclasses
"""
return True
@ -84,10 +84,10 @@ class Archiver:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod
def download(self, item: Metadata) -> Metadata:
pass
pass
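As a rough sketch of how a platform-specific extractor builds on this base (the class, the URL check, and the Metadata helpers shown are assumptions for illustration, not this commit's code):
from dataclasses import dataclass
from auto_archiver.base_modules.extractor import Extractor
from auto_archiver.core import Metadata

@dataclass
class ExampleExtractor(Extractor):  # hypothetical extractor
    name = "example_extractor"

    def suitable(self, url: str) -> bool:
        # only claim URLs this extractor understands
        return "example.com" in url

    def download(self, item: Metadata) -> Metadata:
        result = Metadata()
        result.set_url(item.get_url())
        result.set_title("archived copy")
        # success() marks the result as archived, as other extractors in this repo do
        return result.success("example")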

View file

@ -1,8 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from ..core import Metadata
from ..core import Step
from auto_archiver.core import Metadata
from auto_archiver.core import Step
@dataclass

View file

@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from ..core import Metadata, Media, Step
from auto_archiver.core import Metadata, Media, Step
@dataclass

View file

@ -4,10 +4,10 @@ from dataclasses import dataclass
from typing import IO, Optional
import os
from ..utils.misc import random_str
from auto_archiver.utils.misc import random_str
from ..core import Media, Step, ArchivingContext, Metadata
from ..enrichers import HashEnricher
from auto_archiver.core import Media, Step, ArchivingContext, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from loguru import logger
from slugify import slugify

View file

@ -8,9 +8,4 @@ from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
# from .config import Config
from .media import Media
from .step import Step
from .context import ArchivingContext
from .metadata import Metadata
# from .config import Config

View file

@ -15,7 +15,7 @@ from .loader import MODULE_TYPES
# configurable_parents = [
# Feeder,
# Enricher,
# Archiver,
# Extractor,
# Database,
# Storage,
# Formatter
@ -23,7 +23,7 @@ from .loader import MODULE_TYPES
# ]
# feeder: Feeder
# formatter: Formatter
# archivers: List[Archiver] = field(default_factory=[])
# extractors: List[Extractor] = field(default_factory=[])
# enrichers: List[Enricher] = field(default_factory=[])
# storages: List[Storage] = field(default_factory=[])
# databases: List[Database] = field(default_factory=[])

View file

@ -33,7 +33,7 @@ class ArchivingOrchestrator:
# self.feeder: Feeder = config.feeder
# self.formatter: Formatter = config.formatter
# self.enrichers: List[Enricher] = config.enrichers
# self.archivers: List[Archiver] = config.archivers
# self.extractors: List[Extractor] = config.extractors
# self.databases: List[Database] = config.databases
# self.storages: List[Storage] = config.storages
# ArchivingContext.set("storages", self.storages, keep_on_reset=True)
@ -80,7 +80,7 @@ class ArchivingOrchestrator:
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter'
for module_type in MODULE_TYPES:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
@ -98,7 +98,7 @@ class ArchivingOrchestrator:
self.add_module_args(available_modules(with_manifest=True), parser)
breakpoint()
# breakpoint()
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
@ -165,7 +165,8 @@ class ArchivingOrchestrator:
for module_type in MODULE_TYPES:
if module_type == 'enricher':
breakpoint()
pass
# breakpoint()
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
@ -228,7 +229,7 @@ class ArchivingOrchestrator:
def cleanup(self)->None:
logger.info("Cleaning up")
for e in self.config['steps']['extractors']:
breakpoint()
# breakpoint()
e.cleanup()
def feed(self) -> Generator[Metadata]:

View file

@ -1,5 +0,0 @@
""" Databases are used to store the outputs from running the Autp Archiver.
"""
from .database import Database

View file

@ -1,12 +0,0 @@
"""
Enrichers are modular components that enhance archived content by adding
context, metadata, or additional processing.
These add additional information to the context, such as screenshots, hashes, and metadata.
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""

View file

@ -1,22 +0,0 @@
""" Base classes and utilities for enrichers in the Auto-Archiver system.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from ..core import Metadata, Step
@dataclass
class Enricher(Step, ABC):
name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass

View file

@ -1,3 +0,0 @@
""" Feeders handle the input of media into the Auto Archiver.
"""

View file

@ -1 +0,0 @@
""" Formatters for the output of the content. """

View file

@ -2,7 +2,7 @@ from typing import Union
import requests, os
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -0,0 +1,38 @@
{
"name": "atlos_storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
"bin": [""]
},
"configs": {
# TODO: get base storage configs
# TODO also? get_atlos_config_options()
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"cli_set": lambda cli_val, _: cli_val
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"cli_set": lambda cli_val, _: cli_val
},
},
"description": """
AtlosStorage: A storage module for saving media files to the Atlos platform.
### Features
- Uploads media files to Atlos using Atlos-specific APIs.
- Automatically calculates SHA-256 hashes of media files for integrity verification.
- Skips uploads for files that already exist on Atlos with the same hash.
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
- Provides CDN-like URLs for accessing uploaded media.
### Notes
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
"""
}

View file

@ -4,9 +4,9 @@ from loguru import logger
import requests
import hashlib
from ..core import Media, Metadata
from ..storages import Storage
from ..utils import get_atlos_config_options
from auto_archiver.core import Media, Metadata
from auto_archiver.base_modules import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):

View file

@ -1,11 +1,12 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options

View file

@ -0,0 +1,13 @@
def get_atlos_config_options():
return {
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"cli_set": lambda cli_val, _: cli_val
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"cli_set": lambda cli_val, _: cli_val
},
}
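A module manifest can then reuse these shared options instead of repeating them; a sketch of that pattern (the surrounding manifest keys and the extra "query" option are illustrative, not from this commit):
from auto_archiver.utils import get_atlos_config_options

m = {
    "name": "atlos_feeder",            # hypothetical module reusing the shared options
    "type": ["feeder"],
    "requires_setup": True,
    "configs": {
        **get_atlos_config_options(),  # pulls in api_token and atlos_url
        "query": {"default": None, "help": "optional filter passed to the Atlos API"},
    },
}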

View file

@ -1,7 +1,7 @@
from loguru import logger
import requests
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options

View file

@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext

View file

@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -3,7 +3,7 @@ from loguru import logger
from csv import DictWriter
from dataclasses import asdict
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -1,7 +1,7 @@
from loguru import logger
import csv
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import url_or_none

View file

@ -0,0 +1,34 @@
m = {
"name": "Google Drive Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": [
"loguru",
"google-api-python-client",
"google-auth",
"google-auth-oauthlib",
"google-auth-httplib2"
],
},
"configs": {
# TODO: get base storage configs
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
},
"description": """
GDriveStorage: A storage module for saving archived content to Google Drive.
### Features
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
- Supports OAuth token-based authentication or service account credentials for API access.
- Automatically creates folders in Google Drive if they don't exist.
- Retrieves CDN URLs for stored files, enabling easy sharing and access.
### Notes
- Requires setup with either a Google OAuth token or a service account JSON file.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
- Automatically handles Google Drive API token refreshes for long-running jobs.
"""
}

View file

@ -9,8 +9,8 @@ from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from ..core import Media
from . import Storage
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
class GDriveStorage(Storage):

View file

@ -1,17 +1,12 @@
import os
import mimetypes
import requests
from loguru import logger
from auto_archiver.core.context import ArchivingContext
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media
from .dropin import GenericDropin, InfoExtractor
class Bluesky(GenericDropin):
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
result.set_url(url)
result.set_title(post["record"]["text"])
@ -42,7 +37,7 @@ class Bluesky(GenericDropin):
def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
"""
Iterates over image(s) or video in a Bluesky post and downloads them
"""

View file

@ -1,6 +1,6 @@
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
class GenericDropin:
"""Base class for dropins for the generic extractor.
@ -30,7 +30,7 @@ class GenericDropin:
raise NotImplementedError("This method should be implemented in the subclass")
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
"""
This method should create a Metadata object from the post data.
"""

View file

@ -5,10 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
class GenericExtractor(Archiver):
class GenericExtractor(Extractor):
name = "youtubedl_archiver" #left as is for backwards compat
_dropins = {}

View file

@ -2,7 +2,7 @@ from typing import Type
from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from yt_dlp.extractor.common import InfoExtractor
from dateutil.parser import parse as parse_dt
@ -19,7 +19,7 @@ class Truth(GenericDropin):
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
return True
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
"""
Creates metadata from a truth social post

View file

@ -6,7 +6,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
@ -32,7 +32,7 @@ class Twitter(GenericDropin):
twid = ie_instance._match_valid_url(url).group('id')
return ie_instance._extract_status(twid=twid)
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
try:
if not tweet.get("user") or not tweet.get("created_at"):

View file

@ -1,21 +0,0 @@
# TODO merge with feeder manifest?
{
"name": "gsheet_db",
"type": ["database"],
"requires_setup": True,
"external_dependencies": {"python": [" loguru"],
},
"description": """
Handles integration with Google Sheets for tracking archival tasks.
### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.
### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
""",
}

View file

@ -1,5 +1,5 @@
{
"name": "Google Sheets Feeder",
"name": "Google Sheets Procesor",
"type": ["feeder"],
"requires_setup": True,
"external_dependencies": {
@ -22,7 +22,12 @@
}
},
"description": """
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
Google Sheets Module.
Handles feeding from a Google Sheet as well as an optional write-back to the sheet.
## GsheetsFeeder
A Google Sheets-based feeder for the Auto Archiver.
This reads data from Google Sheets and filters rows based on user-defined rules.
The filtered rows are processed into `Metadata` objects.
@ -36,5 +41,18 @@
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
## GsheetsDatabase:
Handles integration with Google Sheets for tracking archival tasks.
### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.
### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
"""
}

View file

@ -1,10 +1,11 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import GWorksheet

View file

@ -13,8 +13,7 @@ import gspread, os
from loguru import logger
from slugify import slugify
# from . import Enricher
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import Gsheets, GWorksheet

View file

@ -0,0 +1 @@
from .hash_enricher import HashEnricher

View file

@ -7,7 +7,7 @@
},
"configs": {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
},
"description": """
Generates cryptographic hashes for media files to ensure data integrity and authenticity.

View file

@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
import hashlib
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext
@ -40,7 +40,11 @@ class HashEnricher(Enricher):
else:
self.chunksize = self.configs()["chunksize"]["default"]
self.chunksize = int(self.chunksize)
try:
self.chunksize = int(self.chunksize)
except ValueError:
raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
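The chunksize option feeds a chunked read roughly like the following (a simplified sketch, not the module's exact implementation):
import hashlib

def calculate_file_hash(filename: str, algorithm: str = "SHA-256", chunksize: int = int(1.6e7)) -> str:
    # read the file in chunksize-byte pieces so large media never sits fully in RAM
    hasher = hashlib.sha256() if algorithm == "SHA-256" else hashlib.sha3_512()
    with open(filename, "rb") as f:
        while chunk := f.read(chunksize):
            hasher.update(chunk)
    return hasher.hexdigest()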

View file

@ -0,0 +1,13 @@
m = {
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"bin": [""]
},
"configs": {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
},
"description": """ """,
}

View file

@ -7,11 +7,11 @@ from loguru import logger
import json
import base64
from ..version import __version__
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
from ..enrichers import HashEnricher
from ..utils.misc import random_str
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.base_modules import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@dataclass
@ -28,11 +28,11 @@ class HtmlFormatter(Formatter):
})
self.template = self.environment.get_template("html_template.html")
@staticmethod
def configs() -> dict:
return {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
}
# @staticmethod
# def configs() -> dict:
# return {
# "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
# }
def format(self, item: Metadata) -> Media:
url = item.get_url()

View file

@ -1,7 +1,6 @@
{
"name": "Instagram API Archiver",
"name": "Instagram API Extractor",
"type": ["extractor"],
"entry_point": "instagram_api_archiver:InstagramApiArchiver",
"external_dependencies":
{"python": ["requests",
"loguru",

View file

@ -1,5 +1,5 @@
"""
The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
The `instagram_api_extractor` module provides tools for archiving various types of Instagram content
using the [Instagrapi API](https://github.com/subzeroid/instagrapi).
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
@ -16,19 +16,19 @@ from loguru import logger
from retrying import retry
from tqdm import tqdm
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Media
from auto_archiver.core import Metadata
class InstagramAPIArchiver(Archiver):
class InstagramAPIExtractor(Extractor):
"""
Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
name = "instagram_api_archiver"
name = "instagram_api_extractor"
global_pattern = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"

View file

@ -1,7 +1,6 @@
{
"name": "Instagram Archiver",
"name": "Instagram Extractor",
"type": ["extractor"],
"entry_point": "instagram_archiver:InstagramArchiver",
"external_dependencies": {
"python": [
"instaloader",

View file

@ -7,15 +7,15 @@ import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata
from auto_archiver.core import Media
class InstagramArchiver(Archiver):
class InstagramExtractor(Extractor):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
name = "instagram_archiver"
name = "instagram_extractor"
# NB: post regex should be tested before profile
# https://regex101.com/r/MGPquX/1
@ -67,7 +67,7 @@ class InstagramArchiver(Archiver):
elif len(profile_matches):
result = self.download_profile(url, profile_matches[0])
except Exception as e:
logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.")
logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
finally:
shutil.rmtree(self.download_folder, ignore_errors=True)
return result

View file

@ -1,7 +1,6 @@
{
"name": "Instagram Telegram Bot Archiver",
"name": "Instagram Telegram Bot Extractor",
"type": ["extractor"],
"entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
"external_dependencies": {"python": ["loguru",
"telethon",],
},
@ -13,7 +12,7 @@
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
returned as part of a `Metadata` object.
@ -26,7 +25,7 @@ returned as part of a `Metadata` object.
### Setup
To use the `InstagramTbotArchiver`, you need to provide the following configuration settings:
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.

View file

@ -1,5 +1,5 @@
"""
InstagramTbotArchiver Module
InstagramTbotExtractor Module
This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
@ -15,18 +15,18 @@ from sqlite3 import OperationalError
from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class InstagramTbotArchiver(Archiver):
class InstagramTbotExtractor(Extractor):
"""
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
https://github.com/adw0rd/instagrapi
https://t.me/instagram_load_bot
"""
name = "instagram_tbot_archiver"
name = "instagram_tbot_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)
@ -49,7 +49,7 @@ class InstagramTbotArchiver(Archiver):
try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
except OperationalError as e:
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
with self.client.start():
logger.success(f"SETUP {self.name} login works.")

View file

@ -0,0 +1,26 @@
m = {
"name": "Local Storage",
"type": ["storage"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
"configs": {
# TODO: get base storage configs
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
},
"description": """
LocalStorage: A storage module for saving archived content locally on the filesystem.
### Features
- Saves archived media files to a specified folder on the local filesystem.
- Maintains file metadata during storage using `shutil.copy2`.
- Supports both absolute and relative paths for stored files, configurable via `save_absolute`.
- Automatically creates directories as needed for storing files.
### Notes
- Default storage folder is `./archived`, but this can be changed via the `save_to` configuration.
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
"""
}

View file

@ -4,8 +4,8 @@ from typing import IO
import os
from loguru import logger
from ..core import Media
from ..storages import Storage
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
class LocalStorage(Storage):

View file

@ -2,7 +2,7 @@ import datetime
import os
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -2,7 +2,7 @@ import subprocess
import traceback
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -0,0 +1,9 @@
m = {
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
},
"description": """ Default formatter.
""",
}

View file

@ -16,7 +16,7 @@ import numpy as np
from PIL import Image, UnidentifiedImageError
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -0,0 +1,40 @@
m = {
"name": "S3 Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["boto3", "loguru"],
},
"configs": {
# TODO: get base storage configs
"bucket": {"default": None, "help": "S3 bucket name"},
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
},
"cdn_url": {
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
},
"description": """
S3Storage: A storage module for saving media files to an S3-compatible object storage.
### Features
- Uploads media files to an S3 bucket with customizable configurations.
- Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes.
- Automatically generates unique paths for files when duplicates are found.
- Configurable endpoint and CDN URL for different S3-compatible providers.
- Supports both private and public file storage, with public files being readable online.
### Notes
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
"""
}
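A rough sketch of how a hash-based random_no_duplicate key could be derived under NO_DUPLICATES_FOLDER (illustrative only; the module's actual path and existence-check logic may differ):
import hashlib, os, uuid

NO_DUPLICATES_FOLDER = "no-dups/"

def no_duplicate_key(filename: str) -> str:
    # identical content always maps to the same folder, so a second upload can be skipped
    with open(filename, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()[:24]
    ext = os.path.splitext(filename)[1]
    return f"{NO_DUPLICATES_FOLDER}{digest}/{uuid.uuid4().hex[:8]}{ext}"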

View file

@ -2,10 +2,11 @@
from typing import IO
import boto3, os
from ..utils.misc import random_str
from ..core import Media
from ..storages import Storage
from ..enrichers import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
# TODO
from auto_archiver.modules.hash_enricher import HashEnricher
from loguru import logger
NO_DUPLICATES_FOLDER = "no-dups/"

View file

@ -5,7 +5,7 @@ import base64
from selenium.common.exceptions import TimeoutException
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext

View file

@ -3,7 +3,7 @@ from slugify import slugify
from urllib.parse import urlparse
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media

View file

@ -1,7 +1,6 @@
{
"name": "Telegram Archiver",
"name": "Telegram Extractor",
"type": ["extractor"],
"entry_point": "telegram_archiver:TelegramArchiver",
"requires_setup": False,
"external_dependencies": {
"python": [
@ -11,7 +10,7 @@
],
},
"description": """
The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials.
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.

View file

@ -2,16 +2,16 @@ import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media
class TelegramArchiver(Archiver):
class TelegramExtractor(Extractor):
"""
Archiver for telegram that does not require login, but the telethon_archiver is much more advised,
Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
will only return if at least one image or one video is found
"""
name = "telegram_archiver"
name = "telegram_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)

View file

@ -1,8 +1,7 @@
# TODO rm dependency on json
{
"name": "telethon_archiver",
"name": "telethon_extractor",
"type": ["extractor"],
"entry_point": "telethon_archiver:TelethonArchiver",
"requires_setup": True,
"external_dependencies": {
"python": ["telethon",
@ -25,7 +24,7 @@
}
},
"description": """
The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups.
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
if provided in the configuration.
@ -37,7 +36,7 @@ if provided in the configuration.
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TelethonArchiver`, you must configure the following:
To use the `TelethonExtractor`, you must configure the following:
- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps).
- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`).
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.

View file

@ -8,13 +8,13 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class TelethonArchiver(Archiver):
name = "telethon_archiver"
class TelethonArchiver(Extractor):
name = "telethon_extractor"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

View file

@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg, os
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils.misc import random_str

View file

@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
from asn1crypto import pem
import certifi
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
class TimestampingEnricher(Enricher):

View file

@ -1,7 +1,6 @@
{
"name": "Twitter API Archiver",
"name": "Twitter API Extractor",
"type": ["extractor"],
"entry_point": "twitter_api_archiver:TwitterApiArchiver",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
@ -20,7 +19,7 @@
"access_secret": {"default": None, "help": "twitter API access_secret"},
},
"description": """
The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API.
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
It supports multiple API configurations for extended rate limits and reliable access.
Features include URL expansion, media downloads (e.g., images, videos), and structured output
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
@ -34,7 +33,7 @@
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration:
To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
- **Consumer Key and Secret**: Required for user-authenticated API access.
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.

View file

@ -8,11 +8,11 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata,Media
class TwitterApiArchiver(Archiver):
name = "twitter_api_archiver"
class TwitterApiExtractor(Extractor):
name = "twitter_api_extractor"
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def __init__(self, config: dict) -> None:

View file

@ -1,7 +1,6 @@
{
"name": "VKontakte Archiver",
"name": "VKontakte Extractor",
"type": ["extractor"],
"entry_point": "vk_archiver:VKArchiver",
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
@ -14,7 +13,7 @@
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
},
"description": """
The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages.
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.

View file

@ -2,16 +2,16 @@ from loguru import logger
from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver):
class VkExtractor(Extractor):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
name = "vk_archiver"
name = "vk_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)

View file

@ -6,12 +6,11 @@ from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.enrichers import Enricher
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
class WaczArchiverEnricher(Enricher, Archiver):
class WaczExtractorEnricher(Enricher, Extractor):
"""
Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)

View file

@ -2,12 +2,11 @@ import json
from loguru import logger
import time, requests
from auto_archiver.enrichers import Enricher
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.core import Metadata
class WaybackArchiverEnricher(Enricher, Archiver):
class WaybackExtractorEnricher(Enricher, Extractor):
"""
Submits the current URL to the webarchive and returns a job_id or completed archive.

View file

@ -2,9 +2,9 @@ import traceback
import requests, time
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.storages import S3Storage
from auto_archiver.modules import S3Storage
class WhisperEnricher(Enricher):

View file

@ -1,3 +0,0 @@
""" This module contains the storage classes for the auto-archiver.
"""

View file

@ -1,9 +1,7 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.core import Step
from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
class TestArchiverBase(object):
archiver_class: str = None
@ -13,7 +11,7 @@ class TestArchiverBase(object):
def setup_archiver(self):
assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config})
self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config})
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False
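A concrete test class then only needs to point at an extractor and its config; a hedged sketch (the extractor and option names are made up for illustration):
class TestExampleExtractor(TestArchiverBase):
    # hypothetical subclass; setup_archiver instantiates archiver_class({name: config})
    archiver_class = ExampleExtractor          # any Extractor subclass under test
    config = {"some_option": "some_value"}

    def test_suitable(self):
        assert self.archiver.suitable("https://example.com/post/1")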

View file

@ -1,5 +1,4 @@
from auto_archiver.core.context import ArchivingContext
from auto_archiver.formatters.html_formatter import HtmlFormatter
from auto_archiver.modules.html_formatter import HtmlFormatter
from auto_archiver.core import Metadata, Media