Mirror of https://github.com/bellingcat/auto-archiver

Update manifests and modules

parent ba4b330881
commit aa7ca93a43
@@ -0,0 +1 @@
+from api_db import AAApiDb

@@ -15,7 +15,9 @@
 "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
 "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
 "store_results": {"default": True, "help": "when set, will send the results to the API database."},
-"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
+"tags": {"default": [], "help": "what tags to add to the archived URL",
+"type": lambda val: set(val.split(",")),
+}
 },
 "description": """
 Provides integration with the Auto-Archiver API for querying and storing archival data.
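
The change above swaps the two-argument `cli_set` reducer for a one-argument `type` parser. A minimal sketch of how the two styles behave, assuming the config loader simply calls the lambda on the raw CLI string (the loader wiring here is an assumption; only the lambdas come from the manifest):

    # old style: a reducer over (raw CLI value, current value)
    old_cli_set = lambda cli_val, cur_val: set(cli_val.split(","))
    # new style: a plain one-argument parser, argparse-style
    new_type = lambda val: set(val.split(","))

    raw = "tag1,tag2,tag3"
    assert old_cli_set(raw, set()) == {"tag1", "tag2", "tag3"}
    assert new_type(raw) == {"tag1", "tag2", "tag3"}

The same cli_set-to-type substitution recurs throughout the manifests below.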
@@ -2,7 +2,7 @@ from typing import Union
 import requests, os
 from loguru import logger
 
-from auto_archiver.base_modules import Database
+from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .atlos import AtlosStorage

@@ -15,12 +15,12 @@
 "api_token": {
 "default": None,
 "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-"cli_set": lambda cli_val, _: cli_val,
+"type": str,
 },
 "atlos_url": {
 "default": "https://platform.atlos.org",
 "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-"cli_set": lambda cli_val, _: cli_val,
+"type": str,
 },
 },
 "description": """

@@ -5,7 +5,7 @@ import requests
 import hashlib
 
 from auto_archiver.core import Media, Metadata
-from auto_archiver.base_modules import Storage
+from auto_archiver.base_processors import Storage
 from auto_archiver.utils import get_atlos_config_options
 
 

@@ -0,0 +1 @@
+from atlos_db import AtlosDb

@@ -11,12 +11,11 @@
 "api_token": {
 "default": None,
 "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-"cli_set": lambda cli_val, _: cli_val
 },
 "atlos_url": {
 "default": "https://platform.atlos.org",
 "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-"cli_set": lambda cli_val, _: cli_val
+"type": str
 },
 },
 "description": """

@@ -6,7 +6,7 @@ from csv import DictWriter
 from dataclasses import asdict
 import requests
 
-from auto_archiver.base_modules import Database
+from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata
 from auto_archiver.utils import get_atlos_config_options
 
@@ -3,11 +3,11 @@ def get_atlos_config_options():
 "api_token": {
 "default": None,
 "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-"cli_set": lambda cli_val, _: cli_val
+"type": str
 },
 "atlos_url": {
 "default": "https://platform.atlos.org",
 "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-"cli_set": lambda cli_val, _: cli_val
+"type": str
 },
 }
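
get_atlos_config_options() returns the api_token and atlos_url options shared by the Atlos feeder, database and storage modules, which import it above. A hedged sketch of how a manifest could reuse it (the dict merge below is an assumption for illustration; the Atlos manifests in this commit still list the options inline):

    from auto_archiver.utils import get_atlos_config_options

    # hypothetical manifest fragment reusing the shared Atlos options
    configs = {
        **get_atlos_config_options(),  # contributes "api_token" and "atlos_url"
        "some_module_option": {"default": None, "help": "illustrative, not a real option"},
    }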
@@ -0,0 +1 @@
+from .atlos_feeder import AtlosFeeder

@@ -9,12 +9,12 @@
 "api_token": {
 "default": None,
 "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
-"cli_set": lambda cli_val, _: cli_val
+"type": str
 },
 "atlos_url": {
 "default": "https://platform.atlos.org",
 "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
-"cli_set": lambda cli_val, _: cli_val
+"type": str
 },
 },
 "description": """

@@ -1,7 +1,7 @@
 from loguru import logger
 import requests
 
-from auto_archiver.base_modules import Feeder
+from auto_archiver.base_processors import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from auto_archiver.utils import get_atlos_config_options
 

@@ -0,0 +1 @@
+from .cli_feeder import CLIFeeder

@@ -9,7 +9,7 @@
 "urls": {
 "default": None,
 "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+"type": lambda val: set(val.split(",")),
 },
 },
 "description": """

@@ -1,6 +1,6 @@
 from loguru import logger
 
-from auto_archiver.base_modules import Feeder
+from auto_archiver.base_processors import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 
 

@@ -0,0 +1 @@
+from .console_db import ConsoleDb

@@ -1,6 +1,6 @@
 from loguru import logger
 
-from auto_archiver.base_modules import Database
+from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .csv_db import CSVDb

@@ -3,7 +3,7 @@ from loguru import logger
 from csv import DictWriter
 from dataclasses import asdict
 
-from auto_archiver.base_modules import Database
+from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .csv_feeder import CSVFeeder

@@ -11,7 +11,7 @@
 "default": None,
 "help": "Path to the input file(s) to read the URLs from, comma separated. \
 Input files should be formatted with one URL per line",
-"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+"type": lambda val: set(val.split(",")),
 },
 "column": {
 "default": None,

@@ -1,7 +1,7 @@
 from loguru import logger
 import csv
 
-from auto_archiver.base_modules import Feeder
+from auto_archiver.base_processors import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from auto_archiver.utils import url_or_none
 

@@ -17,7 +17,7 @@ class CSVFeeder(Feeder):
 "default": None,
 "help": "Path to the input file(s) to read the URLs from, comma separated. \
 Input files should be formatted with one URL per line",
-"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+"type": lambda val: set(val.split(",")),
 },
 "column": {
 "default": None,

@@ -0,0 +1 @@
+from .gdrive_storage import GDriveStorage

@@ -1,4 +1,4 @@
-m = {
+{
 "name": "Google Drive Storage",
 "type": ["storage"],
 "requires_setup": True,
@@ -12,15 +12,16 @@ m = {
 ],
 },
 "configs": {
-"path_generator": {
-"default": "url",
-"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
-},
-"filename_generator": {
-"default": "random",
-"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
-},
-# TODO: get base storage configs
+"path_generator": {
+"default": "url",
+"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+"choices": ["flat", "url", "random"],
+},
+"filename_generator": {
+"default": "random",
+"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+"choices": ["random", "static"],
+},
 "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
 "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
 "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},

@@ -10,7 +10,7 @@ from google.oauth2.credentials import Credentials
 from google.auth.transport.requests import Request
 
 from auto_archiver.core import Media
-from auto_archiver.base_modules import Storage
+from auto_archiver.base_processors import Storage
 
 
 class GDriveStorage(Storage):

@@ -1,6 +1,6 @@
 from loguru import logger
 
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 from auto_archiver.core.metadata import Metadata, Media
 from .dropin import GenericDropin, InfoExtractor
 

@@ -1,6 +1,6 @@
 from yt_dlp.extractor.common import InfoExtractor
 from auto_archiver.core.metadata import Metadata
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 
 class GenericDropin:
 """Base class for dropins for the generic extractor.

@@ -5,7 +5,7 @@ from yt_dlp.extractor.common import InfoExtractor
 
 from loguru import logger
 
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 from ...core import Metadata, Media, ArchivingContext
 
 class GenericExtractor(Extractor):

@@ -2,7 +2,7 @@ from typing import Type
 
 from auto_archiver.utils import traverse_obj
 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 from yt_dlp.extractor.common import InfoExtractor
 
 from dateutil.parser import parse as parse_dt

@@ -6,7 +6,7 @@ from slugify import slugify
 
 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.utils import UrlUtil
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 
 from .dropin import GenericDropin, InfoExtractor
 
@@ -0,0 +1 @@
+from .gsheet_db import GsheetsDb

@@ -0,0 +1,38 @@
+{
+"name": "Google Sheets Database",
+"type": ["database"],
+"requires_setup": True,
+"external_dependencies": {
+"python": ["loguru", "gspread", "python-slugify"],
+},
+"configs": {
+"allow_worksheets": {
+"default": set(),
+"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+"type": lambda val: set(val.split(",")),
+},
+"block_worksheets": {
+"default": set(),
+"help": "(CSV) explicitly block some worksheets from being processed",
+"type": lambda val: set(val.split(",")),
+},
+"use_sheet_names_in_stored_paths": {
+"default": True,
+"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+}
+},
+"description": """
+GsheetsDatabase:
+Handles integration with Google Sheets for tracking archival tasks.
+
+### Features
+- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
+- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
+- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
+- Skips redundant updates for empty or invalid data fields.
+
+### Notes
+- Currently works only with metadata provided by GsheetFeeder.
+- Requires configuration of a linked Google Sheet and appropriate API credentials.
+"""
+}

@@ -5,7 +5,7 @@ from urllib.parse import quote
 
 from loguru import logger
 
-from auto_archiver.base_modules import Database
+from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.utils import GWorksheet
 

@@ -105,5 +105,4 @@ class GsheetsDb(Database):
 elif self.sheet_id:
 print(self.sheet_id)
 
-
 return gw, row
@@ -0,0 +1 @@
+from .gsheet_feeder import GsheetsFeeder

@@ -9,12 +9,12 @@
 "allow_worksheets": {
 "default": set(),
 "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+"type": lambda val: set(val.split(",")),
 },
 "block_worksheets": {
 "default": set(),
 "help": "(CSV) explicitly block some worksheets from being processed",
-"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+"type": lambda val: set(val.split(",")),
 },
 "use_sheet_names_in_stored_paths": {
 "default": True,

@@ -22,11 +22,7 @@
 }
 },
 "description": """
-Google Sheets Module.
-
-Handles feeding from a google sheet as well as an optional write back to the sheet.
-
-## GsheetsFeeder
+GsheetsFeeder
 A Google Sheets-based feeder for the Auto Archiver.
 
 This reads data from Google Sheets and filters rows based on user-defined rules.

@@ -41,18 +37,5 @@
 ### Notes
 - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
 - Create the sheet using the template provided in the docs.
-
-## GsheetsDatabase:
-Handles integration with Google Sheets for tracking archival tasks.
-
-### Features
-- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
-- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
-- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
-- Skips redundant updates for empty or invalid data fields.
-
-### Notes
-- Currently works only with metadata provided by GsheetFeeder.
-- Requires configuration of a linked Google Sheet and appropriate API credentials.
 """
 }
@@ -13,7 +13,7 @@ import gspread, os
 from loguru import logger
 from slugify import slugify
 
-from auto_archiver.base_modules import Feeder
+from auto_archiver.base_processors import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from auto_archiver.utils import Gsheets, GWorksheet
 
@@ -1 +1 @@
-from hash_enricher import HashEnricher
+from .hash_enricher import HashEnricher
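
The added leading dot makes this a package-relative import: the bare form only resolves if the module's own directory happens to be on sys.path, while the dotted form resolves against the package itself. A minimal sketch (the package path is inferred from the auto_archiver.modules.hash_enricher imports elsewhere in this diff):

    # inside the hash_enricher package's __init__.py
    from .hash_enricher import HashEnricher  # resolved relative to the package

    # callers can then import the class through the package, e.g.:
    # from auto_archiver.modules.hash_enricher import HashEnricher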
@@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
 import hashlib
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata, ArchivingContext
 
 

@@ -0,0 +1 @@
+from .html_formatter import HtmlFormatter

@@ -1,4 +1,4 @@
-m = {
+{
 "name": "HTML Formatter",
 "type": ["formatter"],
 "requires_setup": False,

@@ -9,7 +9,7 @@ import base64
 
 from auto_archiver.version import __version__
 from auto_archiver.core import Metadata, Media, ArchivingContext
-from auto_archiver.base_modules import Formatter
+from auto_archiver.base_processors import Formatter
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
 

@@ -0,0 +1 @@
+from .instagram_api_archiver import InstagramAPIExtractor

@@ -16,7 +16,7 @@ from loguru import logger
 from retrying import retry
 from tqdm import tqdm
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Media
 from auto_archiver.core import Metadata
 

@@ -0,0 +1 @@
+from .instagram_archiver import InstagramExtractor

@@ -7,7 +7,7 @@ import re, os, shutil, traceback
 import instaloader # https://instaloader.github.io/as-module.html
 from loguru import logger
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata
 from auto_archiver.core import Media
 

@@ -0,0 +1 @@
+from .instagram_tbot_archiver import InstagramTbotExtractor

@@ -15,7 +15,7 @@ from sqlite3 import OperationalError
 from loguru import logger
 from telethon.sync import TelegramClient
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.utils import random_str
 

@@ -0,0 +1 @@
+from .local import LocalStorage

@@ -1,4 +1,4 @@
-m = {
+{
 "name": "Local Storage",
 "type": ["storage"],
 "requires_setup": False,
@@ -9,10 +9,12 @@ m = {
 "path_generator": {
 "default": "url",
 "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+"choices": ["flat", "url", "random"],
 },
 "filename_generator": {
 "default": "random",
 "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+"choices": ["random", "static"],
 },
 "save_to": {"default": "./archived", "help": "folder where to save archived content"},
 "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
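
The new `choices` keys constrain the accepted values of these options. A short sketch of the argparse-style validation such a constraint implies (how the real config loader enforces it is an assumption):

    # illustrative check only
    option = {"default": "url", "choices": ["flat", "url", "random"]}

    def pick(value, option):
        if value not in option["choices"]:
            raise ValueError(f"{value!r} is not one of {option['choices']}")
        return value

    pick("url", option)       # ok
    # pick("nested", option)  # would raise ValueError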
@@ -5,7 +5,7 @@ import os
 from loguru import logger
 
 from auto_archiver.core import Media
-from auto_archiver.base_modules import Storage
+from auto_archiver.base_processors import Storage
 
 
 class LocalStorage(Storage):

@@ -0,0 +1 @@
+from .meta_enricher import MetaEnricher

@@ -2,7 +2,7 @@ import datetime
 import os
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .metadata_enricher import MetadataEnricher

@@ -2,7 +2,7 @@ import subprocess
 import traceback
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .mute_formatter import MuteFormatter

@@ -0,0 +1 @@
+from .pdq_hash_enricher import PdqHashEnricher

@@ -16,7 +16,7 @@ import numpy as np
 from PIL import Image, UnidentifiedImageError
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata
 
 

@@ -0,0 +1 @@
+from .s3 import S3Storage

@@ -1,4 +1,4 @@
-m = {
+{
 "name": "S3 Storage",
 "type": ["storage"],
 "requires_setup": True,
@@ -6,29 +6,31 @@ m = {
 "python": ["boto3", "loguru"],
 },
 "configs": {
-"path_generator": {
-"default": "url",
-"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
-},
-"filename_generator": {
-"default": "random",
-"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
-},
-"bucket": {"default": None, "help": "S3 bucket name"},
-"region": {"default": None, "help": "S3 region name"},
-"key": {"default": None, "help": "S3 API key"},
-"secret": {"default": None, "help": "S3 API secret"},
-"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
-"endpoint_url": {
-"default": 'https://{region}.digitaloceanspaces.com',
-"help": "S3 bucket endpoint, {region} are inserted at runtime"
-},
-"cdn_url": {
-"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-},
-"private": {"default": False, "help": "if true S3 files will not be readable online"},
-},
+"path_generator": {
+"default": "url",
+"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+"choices": ["flat", "url", "random"],
+},
+"filename_generator": {
+"default": "random",
+"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+"choices": ["random", "static"],
+},
+"bucket": {"default": None, "help": "S3 bucket name"},
+"region": {"default": None, "help": "S3 region name"},
+"key": {"default": None, "help": "S3 API key"},
+"secret": {"default": None, "help": "S3 API secret"},
+"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
+"endpoint_url": {
+"default": 'https://{region}.digitaloceanspaces.com',
+"help": "S3 bucket endpoint, {region} are inserted at runtime"
+},
+"cdn_url": {
+"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+},
+"private": {"default": False, "help": "if true S3 files will not be readable online"},
+},
 "description": """
 S3Storage: A storage module for saving media files to an S3-compatible object storage.
 
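
Per the help strings, `endpoint_url` and `cdn_url` are templates in which `{region}`, `{bucket}` and `{key}` are filled in at runtime. A small sketch with made-up values (plain str.format substitution is assumed):

    endpoint_url = "https://{region}.digitaloceanspaces.com"
    cdn_url = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"

    # made-up bucket, region and key, for illustration only
    print(endpoint_url.format(region="fra1"))
    # https://fra1.digitaloceanspaces.com
    print(cdn_url.format(bucket="my-archive", region="fra1", key="url/abc123/image.jpg"))
    # https://my-archive.fra1.cdn.digitaloceanspaces.com/url/abc123/image.jpg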
@@ -4,7 +4,7 @@ import boto3, os
 
 from auto_archiver.utils.misc import random_str
 from auto_archiver.core import Media
-from auto_archiver.base_modules import Storage
+from auto_archiver.base_processors import Storage
 # TODO
 from auto_archiver.modules.hash_enricher import HashEnricher
 from loguru import logger

@@ -0,0 +1 @@
+from .screenshot_enricher import ScreenshotEnricher

@@ -5,7 +5,7 @@ import base64
 from selenium.common.exceptions import TimeoutException
 
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.utils import Webdriver, UrlUtil, random_str
 from auto_archiver.core import Media, Metadata, ArchivingContext
 

@@ -0,0 +1 @@
+from .ssl_enricher import SSLEnricher

@@ -3,7 +3,7 @@ from slugify import slugify
 from urllib.parse import urlparse
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata, ArchivingContext, Media
 
 

@@ -0,0 +1 @@
+from .telegram_extractor import TelegramExtractor

@@ -2,7 +2,7 @@ import requests, re, html
 from bs4 import BeautifulSoup
 from loguru import logger
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata, Media
 
 

@@ -0,0 +1 @@
+from .telethon_archiver import TelethonArchiver

@@ -1,4 +1,4 @@
-# TODO rm dependency on json
+import json
 {
 "name": "telethon_extractor",
 "type": ["extractor"],
@@ -19,8 +19,7 @@
 "channel_invites": {
 "default": {},
 "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
-# TODO
-"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+"type": lambda x: json.loads(x),
 }
 },
 "description": """
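
`channel_invites` is now parsed with a plain `json.loads`, so the CLI value is a JSON object pairing invite links with channel ids as described in the help text (the exact key/value shape below is an assumption, with made-up values). Unlike the old `cli_set`, which merged the parsed dict into the current value, the new parser simply returns the parsed object:

    import json

    # made-up invite link and channel id, following the format in the help text
    raw = '{"t.me/+EXAMPLEHASH": 1234567890}'
    channel_invites = json.loads(raw)  # what the new "type" lambda produces
    print(channel_invites)             # {'t.me/+EXAMPLEHASH': 1234567890}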
@@ -8,7 +8,7 @@ from loguru import logger
 from tqdm import tqdm
 import re, time, json, os
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.utils import random_str
 

@@ -0,0 +1 @@
+from .thumbnail_enricher import ThumbnailEnricher

@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
 import ffmpeg, os
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Media, Metadata, ArchivingContext
 from auto_archiver.utils.misc import random_str
 

@@ -0,0 +1 @@
+from .timestamping_enricher import TimestampingEnricher

@@ -21,7 +21,7 @@
 "http://tss.accv.es:8318/tsa"
 ],
 "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+"type": lambda val: set(val.split(",")),
 }
 },
 "description": """

@@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
 from asn1crypto import pem
 import certifi
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata, ArchivingContext, Media
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 
 
 class TimestampingEnricher(Enricher):

@@ -0,0 +1 @@
+from .twitter_api_archiver import TwitterApiExtractor

@@ -12,7 +12,7 @@
 "configs": {
 "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
 "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
-"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
+"type": lambda val: set(val.split(",")),},
 "consumer_key": {"default": None, "help": "twitter API consumer_key"},
 "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
 "access_token": {"default": None, "help": "twitter API access_token"},

@@ -8,7 +8,7 @@ from loguru import logger
 from pytwitter import Api
 from slugify import slugify
 
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata,Media
 
 class TwitterApiExtractor(Extractor):

@@ -0,0 +1 @@
+from .vk_archiver import VkExtractor

@@ -2,7 +2,7 @@ from loguru import logger
 from vk_url_scraper import VkScraper
 
 from auto_archiver.utils.misc import dump_payload
-from auto_archiver.base_modules import Extractor
+from auto_archiver.base_processors import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext
 
 

@@ -0,0 +1 @@
+from .wacz_enricher import WaczExtractorEnricher

@@ -6,7 +6,7 @@ from loguru import logger
 from warcio.archiveiterator import ArchiveIterator
 
 from auto_archiver.core import Media, Metadata, ArchivingContext
-from auto_archiver.base_modules import Extractor, Enricher
+from auto_archiver.base_processors import Extractor, Enricher
 from auto_archiver.utils import UrlUtil, random_str
 
 

@@ -0,0 +1 @@
+from .wayback_enricher import WaybackExtractorEnricher

@@ -2,7 +2,7 @@ import json
 from loguru import logger
 import time, requests
 
-from auto_archiver.base_modules import Extractor, Enricher
+from auto_archiver.base_processors import Extractor, Enricher
 from auto_archiver.utils import UrlUtil
 from auto_archiver.core import Metadata
 

@@ -0,0 +1 @@
+from .whisper_enricher import WhisperEnricher

@@ -2,9 +2,9 @@ import traceback
 import requests, time
 from loguru import logger
 
-from auto_archiver.base_modules import Enricher
+from auto_archiver.base_processors import Enricher
 from auto_archiver.core import Metadata, Media, ArchivingContext
-from auto_archiver.modules import S3Storage
+from auto_archiver.modules.s3_storage import S3Storage
 
 
 class WhisperEnricher(Enricher):

@@ -1,7 +1,7 @@
 import pytest
 
 from auto_archiver.core.metadata import Metadata
-from auto_archiver.base_modules.extractor import Extractor
+from auto_archiver.base_processors.extractor import Extractor
 class TestArchiverBase(object):
 
 archiver_class: str = None