Mirror of https://github.com/bellingcat/auto-archiver
More manifests, base modules and rename from archiver to extractor.
Parent 9db26cdfc2 · Commit 1274a1b231
Changed file: Dockerfile (15 lines changed)
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.0.4 AS base
+FROM webrecorder/browsertrix-crawler:1.4.2 AS base
 
 ENV RUNNING_IN_DOCKER=1 \
     LANG=C.UTF-8 \
@@ -22,28 +22,30 @@ RUN add-apt-repository ppa:mozillateam/ppa && \
 
 # Poetry and runtime
-FROM base AS runtime
+FROM base AS poetry-env
 
 ENV POETRY_NO_INTERACTION=1 \
     POETRY_VIRTUALENVS_IN_PROJECT=1 \
     POETRY_VIRTUALENVS_CREATE=1
 
-RUN pip install --upgrade pip && \
-    pip install "poetry>=2.0.0,<3.0.0"
+# Create a virtual environment for poetry and install it
+RUN python3 -m venv /poetry-venv && \
+    /poetry-venv/bin/python -m pip install --upgrade pip && \
+    /poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0"
 
 WORKDIR /app
 
 COPY pyproject.toml poetry.lock README.md ./
 # Copy dependency files and install dependencies (excluding the package itself)
-RUN poetry install --only main --no-root --no-cache
+RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache
 
 # Copy code: This is needed for poetry to install the package itself,
 # but the environment should be cached from the previous step if toml and lock files haven't changed
 COPY ./src/ .
-RUN poetry install --only main --no-cache
+RUN /poetry-venv/bin/poetry install --only main --no-cache
 
 # Update PATH to include virtual environment binaries
@@ -55,4 +57,3 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"]
 
 # should be executed with 2 volumes (3 if local_storage is used)
 # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
@@ -1,8 +0,0 @@
-"""
-Archivers are responsible for retrieving the content from various external platforms.
-They act as specialized modules, each tailored to interact with a specific platform,
-service, or data source. The archivers collectively enable the tool to comprehensively
-collect and preserve a variety of content types, such as posts, images, videos and metadata.
-
-"""
-from .archiver import Archiver
@@ -0,0 +1,6 @@
+from .database import Database
+from .enricher import Enricher
+from .feeder import Feeder
+from .storage import Storage
+from .extractor import Extractor
+from .formatter import Formatter
@ -3,13 +3,13 @@ from dataclasses import dataclass
|
|||
from abc import abstractmethod, ABC
|
||||
from typing import Union
|
||||
|
||||
from ..core import Metadata, Step
|
||||
from auto_archiver.core import Metadata, Step
|
||||
|
||||
|
||||
@dataclass
|
||||
class Database(Step, ABC):
|
||||
name = "database"
|
||||
|
||||
name = "database"
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
|
@ -0,0 +1,31 @@
|
|||
"""
|
||||
Enrichers are modular components that enhance archived content by adding
|
||||
context, metadata, or additional processing.
|
||||
|
||||
These add additional information to the context, such as screenshots, hashes, and metadata.
|
||||
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
|
||||
the archiving step and before storage or formatting.
|
||||
|
||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from auto_archiver.core import Metadata, Step
|
||||
|
||||
@dataclass
|
||||
class Enricher(Step, ABC):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
||||
name = "enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
# only for typing...
|
||||
def init(name: str, config: dict) -> Enricher:
|
||||
return Step.init(name, config, Enricher)
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
|
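To make the Enricher contract above concrete, here is a minimal sketch of a hypothetical enricher module; the class name, config key, and the `get_title`/`set` helpers on `Metadata` are assumptions based on the interfaces visible in this diff, not part of the commit.

```python
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata


class WordCountEnricher(Enricher):
    """Hypothetical enricher: annotates each archived item with a word count."""
    name = "word_count_enricher"

    def __init__(self, config: dict) -> None:
        # without this Step.__init__ is not called
        super().__init__(config)

    def enrich(self, to_enrich: Metadata) -> None:
        # attach a simple derived value to the metadata being enriched
        text = to_enrich.get_title() or ""
        to_enrich.set("word_count", len(text.split()))
```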
@ -1,7 +1,7 @@
|
|||
""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for archivers.
|
||||
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for extractors.
|
||||
|
||||
Factory method to initialize an archiver instance based on its name.
|
||||
Factory method to initialize an extractor instance based on its name.
|
||||
|
||||
|
||||
"""
|
||||
|
@ -15,32 +15,32 @@ import mimetypes, requests
|
|||
from loguru import logger
|
||||
from retrying import retry
|
||||
|
||||
from ..core import Metadata, Step, ArchivingContext
|
||||
from ..core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
@dataclass
|
||||
class Archiver:
|
||||
class Extractor:
|
||||
"""
|
||||
Base class for implementing archivers in the media archiving framework.
|
||||
Base class for implementing extractors in the media archiving framework.
|
||||
Subclasses must implement the `download` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
def setup(self) -> None:
|
||||
# used when archivers need to login or do other one-time setup
|
||||
# used when extractors need to login or do other one-time setup
|
||||
pass
|
||||
|
||||
def cleanup(self) -> None:
|
||||
# called when archivers are done, or upon errors, cleanup any resources
|
||||
# called when extractors are done, or upon errors, cleanup any resources
|
||||
pass
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
return url
|
||||
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
Returns True if this archiver can handle the given URL
|
||||
|
||||
Returns True if this extractor can handle the given URL
|
||||
|
||||
Should be overridden by subclasses
|
||||
"""
|
||||
return True
|
||||
|
@ -84,10 +84,10 @@ class Archiver:
|
|||
for chunk in d.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
return to_filename
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||
|
||||
@abstractmethod
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
pass
|
||||
pass
|
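As a sketch of how the renamed base class is meant to be subclassed, a hypothetical platform extractor could look like the following; the platform URL, module name, and the `get_url`/`set_title`/`success` helpers on `Metadata` are assumptions drawn from the surrounding code, not definitions made by this commit.

```python
from auto_archiver.base_modules.extractor import Extractor
from auto_archiver.core import Metadata


class ExamplePlatformExtractor(Extractor):
    """Hypothetical extractor for a single, made-up platform."""
    name = "example_platform_extractor"

    def suitable(self, url: str) -> bool:
        # only claim URLs that belong to the platform this extractor knows
        return "example-platform.test" in url

    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
        # fetch the remote post here, attach any Media objects, then mark success
        item.set_title(f"archived copy of {url}")
        return item.success("example_platform")
```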
|
@ -1,8 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from ..core import Metadata, Media, Step
|
||||
from auto_archiver.core import Metadata, Media, Step
|
||||
|
||||
|
||||
@dataclass
|
|
@ -4,10 +4,10 @@ from dataclasses import dataclass
|
|||
from typing import IO, Optional
|
||||
import os
|
||||
|
||||
from ..utils.misc import random_str
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
from ..core import Media, Step, ArchivingContext, Metadata
|
||||
from ..enrichers import HashEnricher
|
||||
from auto_archiver.core import Media, Step, ArchivingContext, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
|
@ -8,9 +8,4 @@ from .context import ArchivingContext
|
|||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
# from .config import Config
|
||||
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
from .metadata import Metadata
|
||||
# from .config import Config
|
|
@ -15,7 +15,7 @@ from .loader import MODULE_TYPES
|
|||
# configurable_parents = [
|
||||
# Feeder,
|
||||
# Enricher,
|
||||
# Archiver,
|
||||
# Extractor,
|
||||
# Database,
|
||||
# Storage,
|
||||
# Formatter
|
||||
|
@ -23,7 +23,7 @@ from .loader import MODULE_TYPES
|
|||
# ]
|
||||
# feeder: Feeder
|
||||
# formatter: Formatter
|
||||
# archivers: List[Archiver] = field(default_factory=[])
|
||||
# extractors: List[Extractor] = field(default_factory=[])
|
||||
# enrichers: List[Enricher] = field(default_factory=[])
|
||||
# storages: List[Storage] = field(default_factory=[])
|
||||
# databases: List[Database] = field(default_factory=[])
|
||||
|
|
|
@ -33,7 +33,7 @@ class ArchivingOrchestrator:
|
|||
# self.feeder: Feeder = config.feeder
|
||||
# self.formatter: Formatter = config.formatter
|
||||
# self.enrichers: List[Enricher] = config.enrichers
|
||||
# self.archivers: List[Archiver] = config.archivers
|
||||
# self.extractors: List[Extractor] = config.extractors
|
||||
# self.databases: List[Database] = config.databases
|
||||
# self.storages: List[Storage] = config.storages
|
||||
# ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
@ -80,7 +80,7 @@ class ArchivingOrchestrator:
|
|||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
|
||||
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter'
|
||||
for module_type in MODULE_TYPES:
|
||||
if modules := getattr(basic_config, f"{module_type}s", []):
|
||||
enabled_modules.extend(modules)
|
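The two loops above first take the modules named under `steps` in the YAML config and then merge in any extras passed on the command line. A small self-contained sketch of that merge follows; the module names and the exact contents of MODULE_TYPES are illustrative assumptions, not taken from this commit.

```python
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]  # assumed ordering

yaml_config = {"steps": {
    "feeders": ["gsheet_feeder"],
    "extractors": ["telethon_extractor", "telegram_extractor"],
    "enrichers": ["hash_enricher"],
    "databases": ["console_db"],
    "storages": ["local_storage"],
    "formatters": ["html_formatter"],
}}
cli_overrides = {"enrichers": ["thumbnail_enricher"]}  # e.g. parsed from argparse

enabled_modules = []
for module_type in MODULE_TYPES:
    enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
    enabled_modules.extend(cli_overrides.get(f"{module_type}s", []))

print(enabled_modules)  # flat list of every module to load
```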
||||
|
@ -98,7 +98,7 @@ class ArchivingOrchestrator:
|
|||
self.add_module_args(available_modules(with_manifest=True), parser)
|
||||
|
||||
|
||||
breakpoint()
|
||||
# breakpoint()
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
|
@ -165,7 +165,8 @@ class ArchivingOrchestrator:
|
|||
|
||||
for module_type in MODULE_TYPES:
|
||||
if module_type == 'enricher':
|
||||
breakpoint()
|
||||
pass
|
||||
# breakpoint()
|
||||
step_items = []
|
||||
modules_to_load = self.config['steps'][f"{module_type}s"]
|
||||
|
||||
|
@ -228,7 +229,7 @@ class ArchivingOrchestrator:
|
|||
def cleanup(self)->None:
|
||||
logger.info("Cleaning up")
|
||||
for e in self.config['steps']['extractors']:
|
||||
breakpoint()
|
||||
# breakpoint()
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
""" Databases are used to store the outputs from running the Autp Archiver.
|
||||
|
||||
|
||||
"""
|
||||
from .database import Database
|
|
@ -1,12 +0,0 @@
|
|||
"""
|
||||
Enrichers are modular components that enhance archived content by adding
|
||||
context, metadata, or additional processing.
|
||||
|
||||
These add additional information to the context, such as screenshots, hashes, and metadata.
|
||||
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
|
||||
the archiving step and before storage or formatting.
|
||||
|
||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
|
||||
|
||||
"""
|
|
@ -1,22 +0,0 @@
|
|||
""" Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from ..core import Metadata, Step
|
||||
|
||||
@dataclass
|
||||
class Enricher(Step, ABC):
|
||||
name = "enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
# only for typing...
|
||||
def init(name: str, config: dict) -> Enricher:
|
||||
return Step.init(name, config, Enricher)
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
|
@ -1,3 +0,0 @@
|
|||
""" Feeders handle the input of media into the Auto Archiver.
|
||||
|
||||
"""
|
|
@ -1 +0,0 @@
|
|||
""" Formatters for the output of the content. """
|
|
@ -2,7 +2,7 @@ from typing import Union
|
|||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.base_modules import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
{
|
||||
"name": "atlos_storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
# TODO also? get_atlos_config_options()
|
||||
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosStorage: A storage module for saving media files to the Atlos platform.
|
||||
|
||||
### Features
|
||||
- Uploads media files to Atlos using Atlos-specific APIs.
|
||||
- Automatically calculates SHA-256 hashes of media files for integrity verification.
|
||||
- Skips uploads for files that already exist on Atlos with the same hash.
|
||||
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
|
||||
- Provides CDN-like URLs for accessing uploaded media.
|
||||
|
||||
### Notes
|
||||
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
|
||||
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
|
||||
"""
|
||||
}
|
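The `configs` block above follows the manifest convention of one `{"default", "help", "cli_set"}` entry per option. As a rough illustration of how such a block can be surfaced as dot-notation command-line flags (the orchestrator's real wiring may differ; this is only a sketch with abridged options):

```python
import argparse

# the "configs" block from a manifest like the one above (abridged)
configs = {
    "api_token": {"default": None, "help": "An Atlos API token."},
    "atlos_url": {"default": "https://platform.atlos.org", "help": "The URL of your Atlos instance."},
}

parser = argparse.ArgumentParser()
for key, opts in configs.items():
    parser.add_argument(f"--atlos_storage.{key}", default=opts["default"], help=opts["help"])

args = parser.parse_args(["--atlos_storage.atlos_url", "https://atlos.example.org"])
print(vars(args))  # {'atlos_storage.api_token': None, 'atlos_storage.atlos_url': '...'}
```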
|
@ -4,9 +4,9 @@ from loguru import logger
|
|||
import requests
|
||||
import hashlib
|
||||
|
||||
from ..core import Media, Metadata
|
||||
from ..storages import Storage
|
||||
from ..utils import get_atlos_config_options
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.base_modules import Storage
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosStorage(Storage):
|
|
@ -1,11 +1,12 @@
|
|||
import os
|
||||
|
||||
from typing import Union
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
import requests
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.base_modules import Database
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
}
|
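Since several Atlos modules import this helper, a module's own config block can simply splice it in. A brief sketch; the module-specific option below is hypothetical:

```python
from auto_archiver.utils import get_atlos_config_options

def configs() -> dict:
    return {
        # hypothetical module-specific option
        "timeout": {"default": 30, "help": "seconds to wait for the Atlos API"},
        # shared options defined by the helper above
        **get_atlos_config_options(),
    }
```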
|
@ -1,7 +1,7 @@
|
|||
from loguru import logger
|
||||
import requests
|
||||
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.base_modules import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.base_modules import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.base_modules import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ from loguru import logger
|
|||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.base_modules import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from loguru import logger
|
||||
import csv
|
||||
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.base_modules import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import url_or_none
|
||||
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
m = {
|
||||
"name": "Google Drive Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"google-api-python-client",
|
||||
"google-auth",
|
||||
"google-auth-oauthlib",
|
||||
"google-auth-httplib2"
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
|
||||
},
|
||||
"description": """
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
### Features
|
||||
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
|
||||
- Supports OAuth token-based authentication or service account credentials for API access.
|
||||
- Automatically creates folders in Google Drive if they don't exist.
|
||||
- Retrieves CDN URLs for stored files, enabling easy sharing and access.
|
||||
|
||||
### Notes
|
||||
- Requires setup with either a Google OAuth token or a service account JSON file.
|
||||
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
|
||||
- Automatically handles Google Drive API token refreshes for long-running jobs.
|
||||
"""
|
||||
}
|
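For the service-account path described above, credentials are typically built with `google-auth` and handed to the Drive client. A hedged sketch follows; the file path, scope, and folder query are illustrative and the module's actual setup may differ.

```python
from google.oauth2 import service_account
from googleapiclient.discovery import build

SCOPES = ["https://www.googleapis.com/auth/drive"]
creds = service_account.Credentials.from_service_account_file(
    "secrets/service_account.json", scopes=SCOPES)
drive = build("drive", "v3", credentials=creds)

# list folders under a root folder (FOLDER_ID is a placeholder for root_folder_id)
resp = drive.files().list(
    q="'FOLDER_ID' in parents and mimeType='application/vnd.google-apps.folder'",
    fields="files(id, name)").execute()
print(resp.get("files", []))
```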
|
@ -9,8 +9,8 @@ from google.oauth2 import service_account
|
|||
from google.oauth2.credentials import Credentials
|
||||
from google.auth.transport.requests import Request
|
||||
|
||||
from ..core import Media
|
||||
from . import Storage
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.base_modules import Storage
|
||||
|
||||
|
||||
class GDriveStorage(Storage):
|
|
@ -1,17 +1,12 @@
|
|||
import os
|
||||
import mimetypes
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.context import ArchivingContext
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
class Bluesky(GenericDropin):
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
result = Metadata()
|
||||
result.set_url(url)
|
||||
result.set_title(post["record"]["text"])
|
||||
|
@ -42,7 +37,7 @@ class Bluesky(GenericDropin):
|
|||
|
||||
|
||||
|
||||
def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
|
||||
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
||||
"""
|
||||
Iterates over image(s) or video in a Bluesky post and downloads them
|
||||
"""
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from yt_dlp.extractor.common import InfoExtractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
|
||||
class GenericDropin:
|
||||
"""Base class for dropins for the generic extractor.
|
||||
|
@ -30,7 +30,7 @@ class GenericDropin:
|
|||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
This method should create a Metadata object from the post data.
|
||||
"""
|
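A hypothetical dropin following the interface above might look like this; the platform specifics are invented for illustration only.

```python
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.base_modules.extractor import Extractor


class ExampleDropin:
    """Hypothetical dropin for the generic extractor."""

    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor) -> bool:
        # let yt-dlp do the heavy lifting unless a dedicated API route exists
        return False

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        result.set_url(url)
        result.set_title(post.get("text", ""))
        return result
```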
||||
|
|
|
@ -5,10 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
|
||||
class GenericExtractor(Archiver):
|
||||
class GenericExtractor(Extractor):
|
||||
name = "youtubedl_archiver" #left as is for backwards compat
|
||||
_dropins = {}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ from typing import Type
|
|||
|
||||
from auto_archiver.utils import traverse_obj
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
@ -19,7 +19,7 @@ class Truth(GenericDropin):
|
|||
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
|
||||
return True
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
Creates metadata from a truth social post
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from slugify import slugify
|
|||
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
@ -32,7 +32,7 @@ class Twitter(GenericDropin):
|
|||
twid = ie_instance._match_valid_url(url).group('id')
|
||||
return ie_instance._extract_status(twid=twid)
|
||||
|
||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
|
||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
result = Metadata()
|
||||
try:
|
||||
if not tweet.get("user") or not tweet.get("created_at"):
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
# TODO merge with feeder manifest?
|
||||
{
|
||||
"name": "gsheet_db",
|
||||
"type": ["database"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {"python": [" loguru"],
|
||||
},
|
||||
"description": """
|
||||
Handles integration with Google Sheets for tracking archival tasks.
|
||||
|
||||
### Features
|
||||
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Notes
|
||||
- Currently works only with metadata provided by GsheetFeeder.
|
||||
- Requires configuration of a linked Google Sheet and appropriate API credentials.
|
||||
""",
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"name": "Google Sheets Procesor",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
|
@ -22,7 +22,12 @@
|
|||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
Google Sheets Module.
|
||||
|
||||
Handles feeding from a google sheet as well as an optional write back to the sheet.
|
||||
|
||||
## GsheetsFeeder
|
||||
A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
@ -36,5 +41,18 @@
|
|||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
|
||||
## GsheetsDatabase:
|
||||
Handles integration with Google Sheets for tracking archival tasks.
|
||||
|
||||
### Features
|
||||
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Notes
|
||||
- Currently works only with metadata provided by GsheetFeeder.
|
||||
- Requires configuration of a linked Google Sheet and appropriate API credentials.
|
||||
"""
|
||||
}
|
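The feeder/database pairing described above is backed by `gspread`. A minimal, hedged sketch of reading rows from a sheet; the spreadsheet name, worksheet, and column headers are illustrative, and the module's own `GWorksheet` wrapper is not shown here.

```python
import gspread

gc = gspread.service_account(filename="secrets/gsheets_service_account.json")
sheet = gc.open("my-archive-sheet")        # illustrative spreadsheet name
wks = sheet.worksheet("Sheet1")

for row in wks.get_all_records():          # one dict per row, keyed by header
    if row.get("Link") and not row.get("Archive status"):
        print("would archive:", row["Link"])
```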
|
@ -1,10 +1,11 @@
|
|||
from typing import Union, Tuple
|
||||
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.base_modules import Database
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import GWorksheet
|
||||
|
|
@ -13,8 +13,7 @@ import gspread, os
|
|||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.base_modules import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import Gsheets, GWorksheet
|
||||
|
|
@ -0,0 +1 @@
|
|||
from .hash_enricher import HashEnricher  # relative import so the package's __init__ can resolve its own submodule
|
|
@ -7,7 +7,7 @@
|
|||
},
|
||||
"configs": {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
},
|
||||
"description": """
|
||||
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
||||
|
|
|
@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
|
|||
import hashlib
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
|
@ -40,7 +40,11 @@ class HashEnricher(Enricher):
|
|||
else:
|
||||
self.chunksize = self.configs()["chunksize"]["default"]
|
||||
|
||||
self.chunksize = int(self.chunksize)
|
||||
try:
|
||||
self.chunksize = int(self.chunksize)
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
|
||||
|
||||
assert self.chunksize >= -1, "read length must be non-negative or -1"
|
||||
|
||||
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
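The `chunksize` option above exists so large files can be hashed without loading them fully into memory. A standalone sketch of that chunked hashing; the enricher's own method names are not reproduced here, only the idea:

```python
import hashlib

def file_hash(path: str, algorithm: str = "SHA-256", chunksize: int = int(1.6e7)) -> str:
    h = hashlib.sha256() if algorithm == "SHA-256" else hashlib.sha3_512()
    with open(path, "rb") as f:
        while chunk := f.read(chunksize):   # read 16MB at a time by default
            h.update(chunk)
    return h.hexdigest()
```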
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
m = {
|
||||
"name": "HTML Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "jinja2"],
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
|
@ -7,11 +7,11 @@ from loguru import logger
|
|||
import json
|
||||
import base64
|
||||
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from . import Formatter
|
||||
from ..enrichers import HashEnricher
|
||||
from ..utils.misc import random_str
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.base_modules import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -28,11 +28,11 @@ class HtmlFormatter(Formatter):
|
|||
})
|
||||
self.template = self.environment.get_template("html_template.html")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
}
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
# }
|
||||
|
||||
def format(self, item: Metadata) -> Media:
|
||||
url = item.get_url()
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "Instagram API Archiver",
|
||||
"name": "Instagram API Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_api_archiver:InstagramApiArchiver",
|
||||
"external_dependencies":
|
||||
{"python": ["requests",
|
||||
"loguru",
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
|
||||
The `instagram_api_extractor` module provides tools for archiving various types of Instagram content
|
||||
using the [Instagrapi API](https://github.com/subzeroid/instagrapi).
|
||||
|
||||
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
|
||||
|
@ -16,19 +16,19 @@ from loguru import logger
|
|||
from retrying import retry
|
||||
from tqdm import tqdm
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class InstagramAPIArchiver(Archiver):
|
||||
class InstagramAPIExtractor(Extractor):
|
||||
"""
|
||||
Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
|
||||
|
||||
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
|
||||
"""
|
||||
|
||||
name = "instagram_api_archiver"
|
||||
name = "instagram_api_extractor"
|
||||
|
||||
global_pattern = re.compile(
|
||||
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "Instagram Archiver",
|
||||
"name": "Instagram Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_archiver:InstagramArchiver",
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"instaloader",
|
|
@ -7,15 +7,15 @@ import re, os, shutil, traceback
|
|||
import instaloader # https://instaloader.github.io/as-module.html
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import Media
|
||||
|
||||
class InstagramArchiver(Archiver):
|
||||
class InstagramExtractor(Extractor):
|
||||
"""
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
name = "instagram_archiver"
|
||||
name = "instagram_extractor"
|
||||
|
||||
# NB: post regex should be tested before profile
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
|
@ -67,7 +67,7 @@ class InstagramArchiver(Archiver):
|
|||
elif len(profile_matches):
|
||||
result = self.download_profile(url, profile_matches[0])
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.")
|
||||
logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
|
||||
finally:
|
||||
shutil.rmtree(self.download_folder, ignore_errors=True)
|
||||
return result
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "Instagram Telegram Bot Archiver",
|
||||
"name": "Instagram Telegram Bot Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
|
||||
"external_dependencies": {"python": ["loguru",
|
||||
"telethon",],
|
||||
},
|
||||
|
@ -13,7 +12,7 @@
|
|||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
},
|
||||
"description": """
|
||||
The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
||||
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
||||
such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
|
||||
to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
|
||||
returned as part of a `Metadata` object.
|
||||
|
@ -26,7 +25,7 @@ returned as part of a `Metadata` object.
|
|||
|
||||
### Setup
|
||||
|
||||
To use the `InstagramTbotArchiver`, you need to provide the following configuration settings:
|
||||
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
|
||||
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
|
||||
- **Session File**: Optional path to store the Telegram session file for future use.
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
InstagramTbotArchiver Module
|
||||
InstagramTbotExtractor Module
|
||||
|
||||
This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
|
||||
It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
|
||||
|
@ -15,18 +15,18 @@ from sqlite3 import OperationalError
|
|||
from loguru import logger
|
||||
from telethon.sync import TelegramClient
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class InstagramTbotArchiver(Archiver):
|
||||
class InstagramTbotExtractor(Extractor):
|
||||
"""
|
||||
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
|
||||
https://github.com/adw0rd/instagrapi
|
||||
https://t.me/instagram_load_bot
|
||||
"""
|
||||
name = "instagram_tbot_archiver"
|
||||
name = "instagram_tbot_extractor"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
@ -49,7 +49,7 @@ class InstagramTbotArchiver(Archiver):
|
|||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
|
@ -0,0 +1,26 @@
|
|||
m = {
|
||||
"name": "Local Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
||||
### Features
|
||||
- Saves archived media files to a specified folder on the local filesystem.
|
||||
- Maintains file metadata during storage using `shutil.copy2`.
|
||||
- Supports both absolute and relative paths for stored files, configurable via `save_absolute`.
|
||||
- Automatically creates directories as needed for storing files.
|
||||
|
||||
### Notes
|
||||
- Default storage folder is `./archived`, but this can be changed via the `save_to` configuration.
|
||||
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
|
||||
"""
|
||||
}
|
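A minimal sketch of the behaviour described above: copy an archived file into `save_to`, creating intermediate folders as needed. The function name and key layout are illustrative.

```python
import os, shutil

def save_locally(source: str, save_to: str = "./archived", key: str = "example/file.jpg") -> str:
    dest = os.path.join(save_to, key)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.copy2(source, dest)   # copy2 preserves file metadata such as timestamps
    return dest
```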
|
@ -4,8 +4,8 @@ from typing import IO
|
|||
import os
|
||||
from loguru import logger
|
||||
|
||||
from ..core import Media
|
||||
from ..storages import Storage
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.base_modules import Storage
|
||||
|
||||
|
||||
class LocalStorage(Storage):
|
|
@ -2,7 +2,7 @@ import datetime
|
|||
import os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ import subprocess
|
|||
import traceback
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
m = {
|
||||
"name": "Mute Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
},
|
||||
"description": """ Default formatter.
|
||||
""",
|
||||
}
|
|
@ -16,7 +16,7 @@ import numpy as np
|
|||
from PIL import Image, UnidentifiedImageError
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
m = {
|
||||
"name": "S3 Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["boto3", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"bucket": {"default": None, "help": "S3 bucket name"},
|
||||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
"secret": {"default": None, "help": "S3 API secret"},
|
||||
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
|
||||
"endpoint_url": {
|
||||
"default": 'https://{region}.digitaloceanspaces.com',
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
||||
},
|
||||
"cdn_url": {
|
||||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
||||
},
|
||||
"private": {"default": False, "help": "if true S3 files will not be readable online"},
|
||||
},
|
||||
"description": """
|
||||
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
||||
|
||||
### Features
|
||||
- Uploads media files to an S3 bucket with customizable configurations.
|
||||
- Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes.
|
||||
- Automatically generates unique paths for files when duplicates are found.
|
||||
- Configurable endpoint and CDN URL for different S3-compatible providers.
|
||||
- Supports both private and public file storage, with public files being readable online.
|
||||
|
||||
### Notes
|
||||
- Requires S3 credentials (API key and secret) and a bucket name to function.
|
||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||
- Uses `boto3` for interaction with the S3 API.
|
||||
"""
|
||||
}
|
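A hedged sketch of how the endpoint/CDN templates and credentials above come together with `boto3`; the bucket, region, key, and credentials are placeholders, and the module's real upload path may differ.

```python
import boto3

region, bucket, key = "fra1", "my-archive-bucket", "no-dups/abc123/video.mp4"

s3 = boto3.client(
    "s3",
    region_name=region,
    endpoint_url="https://{region}.digitaloceanspaces.com".format(region=region),
    aws_access_key_id="KEY",         # placeholder credentials
    aws_secret_access_key="SECRET",
)

with open("video.mp4", "rb") as f:
    s3.upload_fileobj(f, bucket, key, ExtraArgs={"ACL": "public-read"})

cdn_url = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}".format(
    bucket=bucket, region=region, key=key)
print(cdn_url)
```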
|
@ -2,10 +2,11 @@
|
|||
from typing import IO
|
||||
import boto3, os
|
||||
|
||||
from ..utils.misc import random_str
|
||||
from ..core import Media
|
||||
from ..storages import Storage
|
||||
from ..enrichers import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.base_modules import Storage
|
||||
# TODO
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from loguru import logger
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
|
@ -5,7 +5,7 @@ import base64
|
|||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.utils import Webdriver, UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ from slugify import slugify
|
|||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "Telegram Archiver",
|
||||
"name": "Telegram Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "telegram_archiver:TelegramArchiver",
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
|
@ -11,7 +10,7 @@
|
|||
],
|
||||
},
|
||||
"description": """
|
||||
The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials.
|
||||
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
|
||||
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
|
||||
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
|
||||
is advised for more comprehensive functionality.
|
|
@ -2,16 +2,16 @@ import requests, re, html
|
|||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class TelegramArchiver(Archiver):
|
||||
class TelegramExtractor(Extractor):
|
||||
"""
|
||||
Archiver for telegram that does not require login, but the telethon_archiver is much more advised,
|
||||
Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
|
||||
will only return if at least one image or one video is found
|
||||
"""
|
||||
name = "telegram_archiver"
|
||||
name = "telegram_extractor"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
|
@ -1,8 +1,7 @@
|
|||
# TODO rm dependency on json
|
||||
{
|
||||
"name": "telethon_archiver",
|
||||
"name": "telethon_extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "telethon_archiver:TelethonArchiver",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["telethon",
|
||||
|
@ -25,7 +24,7 @@
|
|||
}
|
||||
},
|
||||
"description": """
|
||||
The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups.
|
||||
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
|
||||
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
|
||||
if provided in the configuration.
|
||||
|
||||
|
@ -37,7 +36,7 @@ if provided in the configuration.
|
|||
- Outputs structured metadata and media using `Metadata` and `Media` objects.
|
||||
|
||||
### Setup
|
||||
To use the `TelethonArchiver`, you must configure the following:
|
||||
To use the `TelethonExtractor`, you must configure the following:
|
||||
- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps).
|
||||
- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`).
|
||||
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
|
|
@ -8,13 +8,13 @@ from loguru import logger
|
|||
from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Archiver):
|
||||
name = "telethon_archiver"
|
||||
class TelethonArchiver(Extractor):
|
||||
name = "telethon_extractor"
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
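The two regexes above decide whether a URL is a Telegram message link or an invite link. Exercising them on sample, made-up URLs:

```python
import re

link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

m = link_pattern.search("https://t.me/some_channel/1234")
print(m.group(2), m.group(3))   # -> some_channel 1234

print(bool(invite_pattern.search("https://t.me/+AbCdEfGhIjK")))   # -> True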
||||
|
|
@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
|
|||
import ffmpeg, os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
|
|
@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
|
|||
from asn1crypto import pem
|
||||
import certifi
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "Twitter API Archiver",
|
||||
"name": "Twitter API Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "twitter_api_archiver:TwitterApiArchiver",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["requests",
|
||||
|
@ -20,7 +19,7 @@
|
|||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||
},
|
||||
"description": """
|
||||
The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API.
|
||||
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
|
||||
It supports multiple API configurations for extended rate limits and reliable access.
|
||||
Features include URL expansion, media downloads (e.g., images, videos), and structured output
|
||||
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
|
||||
|
@ -34,7 +33,7 @@
|
|||
- Outputs structured metadata and media using `Metadata` and `Media` objects.
|
||||
|
||||
### Setup
|
||||
To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration:
|
||||
To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
|
||||
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
|
||||
- **Consumer Key and Secret**: Required for user-authenticated API access.
|
||||
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
|
|
@ -8,11 +8,11 @@ from loguru import logger
|
|||
from pytwitter import Api
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata,Media
|
||||
|
||||
class TwitterApiArchiver(Archiver):
|
||||
name = "twitter_api_archiver"
|
||||
class TwitterApiExtractor(Extractor):
|
||||
name = "twitter_api_extractor"
|
||||
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"name": "VKontakte Archiver",
|
||||
"name": "VKontakte Extractor",
|
||||
"type": ["extractor"],
|
||||
"entry_point": "vk_archiver:VKArchiver",
|
||||
"requires_setup": True,
|
||||
"depends": ["core", "utils"],
|
||||
"external_dependencies": {
|
||||
|
@ -14,7 +13,7 @@
|
|||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||
},
|
||||
"description": """
|
||||
The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages.
|
||||
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
|
||||
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
|
||||
and download content. Note that VK videos are handled separately by the `YTDownloader`.
|
||||
|
|
@ -2,16 +2,16 @@ from loguru import logger
|
|||
from vk_url_scraper import VkScraper
|
||||
|
||||
from auto_archiver.utils.misc import dump_payload
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class VkArchiver(Archiver):
|
||||
class VkExtractor(Extractor):
|
||||
""""
|
||||
VK videos are handled by YTDownloader, this archiver gets posts text and images.
|
||||
Currently only works for /wall posts
|
||||
"""
|
||||
name = "vk_archiver"
|
||||
name = "vk_extractor"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
|
@ -6,12 +6,11 @@ from loguru import logger
|
|||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor, Enricher
|
||||
from auto_archiver.utils import UrlUtil, random_str
|
||||
|
||||
|
||||
class WaczArchiverEnricher(Enricher, Archiver):
|
||||
class WaczExtractorEnricher(Enricher, Extractor):
|
||||
"""
|
||||
Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
|
||||
If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
|
||||
|
|
|
@ -2,12 +2,11 @@ import json
|
|||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.base_modules import Extractor, Enricher
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
class WaybackExtractorEnricher(Enricher, Extractor):
|
||||
"""
|
||||
Submits the current URL to the webarchive and returns a job_id or completed archive.
|
||||
|
||||
|
|
|
@ -2,9 +2,9 @@ import traceback
|
|||
import requests, time
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.base_modules import Enricher
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.storages import S3Storage
|
||||
from auto_archiver.modules import S3Storage
|
||||
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
""" This module contains the storage classes for the auto-archiver.
|
||||
|
||||
"""
|
|
@ -1,9 +1,7 @@
|
|||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import Step
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.archivers.archiver import Archiver
|
||||
from auto_archiver.base_modules.extractor import Extractor
|
||||
class TestArchiverBase(object):
|
||||
|
||||
archiver_class: str = None
|
||||
|
@ -13,7 +11,7 @@ class TestArchiverBase(object):
|
|||
def setup_archiver(self):
|
||||
assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config})
|
||||
self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config})
|
||||
|
||||
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
|
||||
assert test_response is not False
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from auto_archiver.core.context import ArchivingContext
|
||||
from auto_archiver.formatters.html_formatter import HtmlFormatter
|
||||
from auto_archiver.modules.html_formatter import HtmlFormatter
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
|
|