Merge branch 'main' into timestamping_rewrite

pull/224/head
Patrick Robertson 2025-02-25 17:10:55 +00:00
commit 4dcb77c29f
48 changed files with 889 additions and 299 deletions

View file

@ -23,11 +23,13 @@ Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.
## Installation
View the [Installation Guide](installation/installation.md) for full instructions
View the [Installation Guide](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html) for full instructions
**Advanced:**
To get started quickly using Docker:
`docker pull bellingcat/auto-archiver && docker run`
`docker pull bellingcat/auto-archiver && docker run --rm -v secrets:/app/secrets bellingcat/auto-archiver --config secrets/orchestration.yaml`
Or pip:

View file

@ -3,6 +3,7 @@ from pathlib import Path
from auto_archiver.core.module import ModuleFactory
from auto_archiver.core.base_module import BaseModule
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap
import io
MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_archiver" / "modules"
@ -30,6 +31,7 @@ steps:
...
{config_string}
"""
def generate_module_docs():
@ -38,8 +40,9 @@ def generate_module_docs():
modules_by_type = {}
header_row = "| " + " | ".join(TABLE_HEADER) + "|\n" + "| --- " * len(TABLE_HEADER) + "|\n"
configs_cheatsheet = "\n## Configuration Options\n"
configs_cheatsheet += header_row
global_table = "\n## Configuration Options\n" + header_row
global_yaml = yaml.load("""\n# Module configuration\nplaceholder: {}""")
for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file.
@ -66,19 +69,30 @@ def generate_module_docs():
config_table = header_row
config_yaml = {}
global_yaml[module.name] = CommentedMap()
global_yaml.yaml_set_comment_before_after_key(module.name, f"\n\n{module.display_name} configuration options")
for key, value in manifest['configs'].items():
type = value.get('type', 'string')
if type == 'auto_archiver.utils.json_loader':
if type == 'json_loader':
value['type'] = 'json'
elif type == 'str':
type = "string"
default = value.get('default', '')
config_yaml[key] = default
global_yaml[module.name][key] = default
if value.get('help', ''):
global_yaml[module.name].yaml_add_eol_comment(value.get('help', ''), key)
help = "**Required**. " if value.get('required', False) else "Optional. "
help += value.get('help', '')
config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"
configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
readme_str += "\n## Configuration Options\n"
readme_str += "\n### YAML\n"
@ -103,8 +117,13 @@ def generate_module_docs():
f.write(readme_str)
generate_index(modules_by_type)
del global_yaml['placeholder']
global_string = io.BytesIO()
global_yaml = yaml.dump(global_yaml, global_string)
global_string = global_string.getvalue().decode('utf-8')
global_yaml = f"```yaml\n{global_string}\n```"
with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f:
f.write(configs_cheatsheet)
f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table)
def generate_index(modules_by_type):

BIN
docs/source/bc.png 100644

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

View file

@ -3,9 +3,11 @@
import sys
import os
from importlib.metadata import metadata
from datetime import datetime
sys.path.append(os.path.abspath('../scripts'))
from scripts import generate_module_docs
from auto_archiver.version import __version__
# -- Project Hooks -----------------------------------------------------------
# convert the module __manifest__.py files into markdown files
@ -15,7 +17,8 @@ generate_module_docs()
# -- Project information -----------------------------------------------------
package_metadata = metadata("auto-archiver")
project = package_metadata["name"]
authors = "Bellingcat"
copyright = str(datetime.now().year)
author = "Bellingcat"
release = package_metadata["version"]
language = 'en'
@ -32,7 +35,7 @@ extensions = [
]
templates_path = ['_templates']
exclude_patterns = []
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""]
# -- AutoAPI Configuration ---------------------------------------------------
@ -76,6 +79,14 @@ source_suffix = {
html_theme = 'sphinx_book_theme'
html_static_path = ["../_static"]
html_css_files = ["custom.css"]
html_title = f"Auto Archiver v{__version__}"
html_logo = "bc.png"
html_theme_options = {
"repository_url": "https://github.com/bellingcat/auto-archiver",
"use_repository_button": True,
}
copybutton_prompt_text = r">>> |\.\.\."
copybutton_prompt_is_regexp = True

View file

@ -1,8 +1,8 @@
# Module Documentation
These pages describe the core modules that come with `auto-archiver` and provide the main functionality for archiving websites on the internet. There are five core module types:
These pages describe the core modules that come with Auto Archiver and provide the main functionality for archiving websites on the internet. There are five core module types:
1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing
1. Feeders - these 'feed' information (the URLs) from various sources to the Auto Archiver for processing
2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder
3. Enrichers - these 'enrich' the data extracted in the previous step with additional information
4. Storage - these 'store' the data in a persistent location (on disk, Google Drive etc.)

View file

@ -1,6 +1,6 @@
# Creating Your Own Modules
Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include:
Modules are what's used to extend Auto Archiver to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include:
1. Extracting data from a website which doesn't work with the current core extractors.
2. Enriching or altering the data before saving with additional information that the core enrichers do not offer.
@ -21,7 +21,7 @@ When done, you should have a module structure as follows:
│ └── awesome_extractor.py
```
Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules.
Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the Auto Archiver repository for examples of the folder structure for real-world modules.
## Populating the Manifest File

View file

@ -6,7 +6,7 @@
1. Update the version number in [version.py](src/auto_archiver/version.py)
2. Go to github releases > new release > use `vx.y.z` for matching version notation
1. package is automatically updated in pypi
2. docker image is automatically pushed to dockerhup
2. docker image is automatically pushed to dockerhub

View file

@ -1,49 +1,6 @@
# How-To Guides
## How to use Google Sheets to load and store archive information
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` - see the [Gsheet Feeder Docs](modules/autogen/feeder/gsheet_feeder.md) for more info. The default names of these columns and their purpose is:
Inputs:
* **Link** *(required)*: the URL of the post to archive
* **Destination folder**: custom folder for archived file (regardless of storage)
Outputs:
* **Archive status** *(required)*: Status of archive operation
* **Archive location**: URL of archived post
* **Archive date**: Date archived
* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
* **Timestamp**: Timestamp of original post
* **Title**: Post title
* **Text**: Post text
* **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation.
When the auto archiver starts running, it updates the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
![The archive result for a link in the demo sheet.](../demo-archive.png)
The following pages contain helpful how-to guides for common use cases of the Auto Archiver.
---
```{toctree}
@ -51,4 +8,5 @@ The "archive location" link contains the path of the archived file, in local sto
:glob:
how_to/*
```

View file

@ -0,0 +1,110 @@
# Logging in to sites
This how-to guide shows you how you can use various authentication methods to allow you to login to a site you are trying to archive. This is useful for websites that require a user to be logged in to browse them, or for sites that restrict bots.
In this How-To, we will authenticate on Twitter/X.com using cookies, and on XXXX using username/password.
## Using cookies to authenticate on Twitter/X
It can be useful to archive tweets after logging in, since some tweets are only visible to authenticated users. One case is Tweets marked as 'Sensitive'.
Take this tweet as an example: [https://x.com/SozinhoRamalho/status/1876710769913450647](https://x.com/SozinhoRamalho/status/1876710769913450647)
This tweet has been marked as sensitive, so a normal run of Auto Archiver without a logged in session will fail to extract the tweet:
```{code-block} console
:emphasize-lines: 3,4,5,6
>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱
...
ERROR: [twitter] 1876710769913450647: NSFW tweet requires authentication. Use --cookies,
--cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to
provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
for how to manually pass cookies
[twitter] 1876710769913450647: Downloading guest token
[twitter] 1876710769913450647: Downloading GraphQL JSON
2025-02-20 15:06:13.362 | ERROR | auto_archiver.modules.generic_extractor.generic_extractor:download_for_extractor:248 - Error downloading metadata for post: NSFW tweet requires authentication. Use --cookies, --cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies
[generic] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647
[generic] 1876710769913450647: Downloading webpage
WARNING: [generic] Falling back on generic information extractor
[generic] 1876710769913450647: Extracting information
ERROR: Unsupported URL: https://x.com/SozinhoRamalho/status/1876710769913450647
2025-02-20 15:06:13.744 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor telegram_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647
2025-02-20 15:06:13.744 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='nothing archived', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 6, 12, 473979, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[])
...
```
To get round this limitation, we can use **cookies** (information about a logged in user) to mimic being logged in to Twitter. There are two ways to pass cookies to Auto Archiver. One is from a file, and the other is from a browser profile on your computer.
In this tutorial, we will export the Twitter cookies from our browser and add them to Auto Archiver.
**1. Installing a cookie exporter extension**
First, we need to install an extension in our browser to export the cookies for a certain site. The [FAQ on yt-dlp](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp) provides some suggestions: Get [cookies.txt LOCALLY](https://chromewebstore.google.com/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) for Chrome or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) for Firefox.
**2. Export the cookies**
```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for archiving.
```
Once the extension is installed in your preferred browser, login to Twitter in this browser, and then activate the extension and export the cookies. You can choose to export all your cookies for your browser, or just cookies for this specific site. In the image below, we're only exporting cookies for Twitter/x.com:
![extract cookies](extract_cookies.png)
**3. Adding the cookies file to Auto Archiver**
You now will have a file called `cookies.txt` (tip: name it `twitter_cookies.txt` if you only exported cookies for Twitter), which needs to be added to Auto Archiver.
Do this by going into your Auto Archiver configuration file, and editing the `authentication` section. We will add the `cookies_file` option for the site `x.com,twitter.com`.
```{note} For websites that have multiple URLs (like x.com and twitter.com) you can 'reuse' the same login information without duplicating it using a comma separated list of domain names.
```
I've saved my `twitter_cookies.txt` file in a `secrets` folder, so here's how my authentication section looks now:
```{code} yaml
:caption: orchestration.yaml
...
authentication:
x.com,twitter.com:
cookies_file: secrets/twitter_cookies.txt
...
```
**4. Re-run your archiving with the cookies enabled**
Now, the next time we re-run Auto Archiver, the cookies from our logged-in session will be used, and restricted/sensitive tweets can be downloaded!
```{code} console
>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱ ◼
...
2025-02-20 15:27:46.785 | WARNING | auto_archiver.modules.console_db.console_db:started:13 - STARTED Metadata(status='no archiver', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 46, 785304, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[])
2025-02-20 15:27:46.785 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor generic_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647
[twitter] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647
...
2025-02-20 15:27:53.134 | INFO | auto_archiver.modules.local_storage.local_storage:upload:26 - ./local_archive/https-x-com-sozinhoramalho-status-1876710769913450647/06e8bacf27ac4bb983bf6280.html
2025-02-20 15:27:53.135 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='yt-dlp_Twitter: success',
metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 48, 564738, tzinfo=datetime.timezone.utc), 'url':
'https://x.com/SozinhoRamalho/status/1876710769913450647', 'title': 'ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1',
...
```
### Finishing Touches
You've now successfully exported your cookies from a logged-in session in your browser, and used them to authenticate with Twitter and download a sensitive tweet. Congratulations!
Finally, some important things to remember:
1. It's best not to use your own personal account for archiving. [Here's why](../installation/authentication.md#recommendations-for-authentication).
2. Cookies can be short-lived, so may need updating. Sometimes, a website session may 'expire' or a website may force you to login again. In these instances, you'll need to repeat the export step (step 2) after logging in again to update your cookies.
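As an alternative to re-exporting a cookies file whenever your session expires, Auto Archiver can also read cookies directly from a browser installed on your machine via the `cookies_from_browser` option (see the [authentication docs](../installation/authentication.md)). A minimal sketch, assuming you are logged in to Twitter/X in Firefox:
```{code} yaml
:caption: orchestration.yaml
...
authentication:
  x.com,twitter.com:
    # read the logged-in session straight from your local Firefox profile
    cookies_from_browser: firefox
...
```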
## Authenticating on XXXX site with username/password
```{note} This section is still under construction 🚧
```
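Until that section is written, here is a rough sketch of what username/password authentication looks like, based on the options described in the [authentication docs](../installation/authentication.md) (the domain and credentials below are placeholders):
```{code} yaml
:caption: orchestration.yaml
...
authentication:
  examplesite.com:   # hypothetical site - use the domain you actually log in to
    username: myusername
    password: mypassword
...
```
Remember that username/password details are only used by the Generic Extractor, and some sites (notably Twitter/X and YouTube) block this kind of login, so cookies are usually the more robust option.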

Binary file not shown.

After

Width:  |  Height:  |  Size: 944 KiB

View file

@ -0,0 +1,122 @@
# Using Google Sheets
This guide explains how to set up Google Sheets to process URLs automatically and then store the archiving status back into the Google sheet. It is broadly split into 3 steps:
1. Setting up your Google Sheet
2. Setting up a service account so Auto Archiver can access the sheet
3. Setting the Auto Archiver settings
## 1. Setting up your Google Sheet
Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. Your sheet can have many other columns that the Auto Archiver can use, and you can also include any other columns for your own personal use.
We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project.
Here's an overview of all the columns, and what a complete sheet would look like.
Inputs:
* **Link** *(required)*: the URL of the post to archive
* **Destination folder**: custom folder for archived file (regardless of storage)
Outputs:
* **Archive status** *(required)*: Status of archive operation
* **Archive location**: URL of archived post
* **Archive date**: Date archived
* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
* **Timestamp**: Timestamp of original post
* **Title**: Post title
* **Text**: Post text
* **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
We'll change the name of the 'Destination Folder' column in step 3.
## 2. Setting up your Service Account
Once your Google Sheet is set up, you need to create what's called a 'service account' that will allow the Auto Archiver to access it.
To do this, follow the steps in [this guide](https://gspread.readthedocs.io/en/latest/oauth2.html) all the way up until step 8. You should have downloaded a file called `service_account.json` and shared the Google Sheet with the `client_email` address listed in this file.
Once you've downloaded the file, save it to `secrets/service_account.json`
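If you save the file to a different location, you can tell the `gsheet_feeder` where to find it in your configuration. A minimal sketch, assuming the module exposes a `service_account` path option (check the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md) for the exact option name):
```{code} yaml
:caption: orchestration.yaml
...
gsheet_feeder:
  # path to the service account credentials downloaded in this step
  service_account: secrets/service_account.json
...
```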
## 3. Setting up the configuration file
Now that you've set up your Google sheet, and you've set up the service account so Auto Archiver can access the sheet, the final step is to set your configuration.
First, make sure you have `gsheet_feeder` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also add `gsheet_db` to the `steps.databases` section. Here's how this might look:
```{code} yaml
steps:
feeders:
- gsheet_feeder
...
databases:
- gsheet_db # optional, if you also want to store the results in the Google sheet
...
```
Next, set up the `gsheet_feeder` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up the file, and set the `gsheet_feeder.sheet` setting or the `gsheet_feeder.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'.
Here's how this might look:
```{code} yaml
...
gsheet_feeder:
sheet: 'My Awesome Sheet'
...
```
You can also pass these settings directly on the command line without having to edit the file; here's an example of how to do that (using docker):
`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.sheet "Auto archive test 2023-2"`.
Here, the sheet name has been overridden/specified in the command line invocation.
### 3a. (Optional) Changing the column names
In step 1, we said we would change the name of the 'Destination Folder' column. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `gsheet_feeder.columns` setting in the configuration file. For more information on this setting, see the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'columns' setting, and then edit the 'Destination Folder' entry to rename it 'Save Folder'. Our final configuration section looks like:
```{code} yaml
...
gsheet_feeder:
sheet: 'My Awesome Sheet'
columns:
url: link
status: archive status
folder: save folder # <-- note how this value has been changed
archive: archive location
date: archive date
thumbnail: thumbnail
timestamp: upload timestamp
title: upload title
text: text content
screenshot: screenshot
hash: hash
pdq_hash: perceptual hashes
wacz: wacz
replaywebpage: replaywebpage
```
## Viewing the Results after archiving
With `gsheet_db` enabled, once you start running the Auto Archiver, it will update the "Archive status" column.
![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
The links are downloaded and archived, and the spreadsheet is updated to the following:
![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
![The archive result for a link in the demo sheet.](../demo-archive.png)

View file

@ -0,0 +1,71 @@
# Keeping Logs
Auto Archiver's logs can be helpful for debugging problematic archiving processes. This guide shows you how to set up logging and make the most of the logs.
## Setting up logging
Logging settings can be set on the command line or using the orchestration config file ([learn more](../installation/configuration)). A special `logging` section defines the logging options.
#### Enabling or Disabling Logging
Logging to the console is enabled by default. If you want to globally disable Auto Archiver's logging, then you can set `enabled: false` in your `logging` config:
```{code} yaml
...
logging:
enabled: false
...
```
```{note}
This will disable all logs from Auto Archiver, but it does not disable logs for other tools that the Auto Archiver uses (for example: yt-dlp, firefox or ffmpeg). These logs will still appear in your console.
```
#### Logging Level
There are 7 logging levels in total, with 4 commonly used levels. They are: `DEBUG`, `INFO`, `WARNING` and `ERROR`.
Change the logging level by setting the value in your orchestration config file:
```{code} yaml
:caption: orchestration.yaml
...
logging:
level: DEBUG # or INFO / WARNING / ERROR
...
```
For normal usage, it is recommended to use the `INFO` level, or if you prefer quieter logs with less information, you can use the `WARNING` level. If you encounter issues with the archiving, then it's recommended to enable the `DEBUG` level.
```{note} To learn about all logging levels, see the [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html)
```
### Logging to a file
By default, auto-archiver will log to the console. But if you wish to store your logs for future reference, or you are running auto-archiver from within a code implementation, then you may wish to enable file logging. This can be done by setting the `file:` config value in the logging settings.
**Rotation:** For file logging, you can choose to 'rotate' your log files (creating new log files) so they do not get too large. Change this by setting the 'rotation' option in your logging settings. For a full list of rotation options, see the [loguru docs](https://loguru.readthedocs.io/en/stable/overview.html#easier-file-logging-with-rotation-retention-compression).
```{code} yaml
:caption: orchestration.yaml
logging:
...
file: /my/log/file.log
rotation: 1 day
```
### Full logging example
The example below logs only `WARNING` (and higher) messages to the console and to the file `/my/file.log`, rotating that file once per week:
```{code} yaml
:caption: orchestration.yaml
logging:
level: WARNING
file: /my/file.log
rotation: 1 week
```

View file

@ -0,0 +1,127 @@
# Upgrading to v0.13
```{note} This how-to is only relevant for people who used Auto Archiver before February 2025 (versions prior to 0.13).
If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
```
Version 0.13 of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
## How do I know if I need to update my configuration format?
There are two simple ways to check if you need to update your format:
1. When you try and run auto-archiver using your existing configuration file, you get an error about no feeders or formatters being configured, like:
```{code} console
AssertionError: No feeders were configured. Make sure to set at least one feeder in
your configuration file or on the command line (using --feeders)
```
2. Within your configuration file, you have a `feeder:` option. This is the old format. An example of the old format:
```{code} yaml
steps:
feeder: gsheet_feeder
...
```
## Updating your configuration file
To update your configuration file, you can either:
### 1. Manually edit the configuration file and change the values.
This is recommended if you want to keep all your old settings. Follow the steps below to change the relevant settings:
#### a) Feeder & Formatter Steps Settings
The feeder and formatter settings have been changed from a single string to a list.
- `steps.feeder` (string) → `steps.feeders` (list)
- `steps.formatter` (string) → `steps.formatters` (list)
Example:
```{code} yaml
steps:
feeder: cli_feeder
...
formatter: html_formatter
# the above should be changed to:
steps:
feeders:
- cli_feeder
...
formatters:
- html_formatter
```
```{note} Auto Archiver still only supports one feeder and formatter, but from v0.13 onwards they must be added to the configuration file as a list.
```
#### b) Extractor (formerly Archiver) Steps Settings
With v0.13 of Auto Archiver, `archivers` have been renamed to `extractors` to better reflect what they actually do - extract information from a URL. Change the configuration by renaming:
- `steps.archivers` → `steps.extractors`
The names of the actual modules have also changed, so for any extractor modules you have enabled, you will need to rename the `archiver` part to `extractor`. Some examples:
- `telethon_archiver` → `telethon_extractor`
- `wacz_archiver_enricher` → `wacz_extractor_enricher`
- `wayback_archiver_enricher` → `wayback_extractor_enricher`
- `vk_archiver` → `vk_extractor`
Additionally, the `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md).
Example:
```{code} yaml
steps:
...
archivers:
- telethon_archiver
- youtube_archiver
- vk_archiver
# renaming 'archiver' to 'extractor', and renaming the youtube_archiver the above config will become:
steps:
...
extractors:
- telethon_extractor
- vk_extractor
- generic_extractor
```
#### c) Redundant / Obsolete Modules
With v0.13 of Auto Archiver, the following modules have been removed and their features have been built into the generic_extractor. You should remove them from the 'steps' section of your configuration file:
* `twitter_archiver` - use the `generic_extractor` for general extraction, or the `twitter_api_extractor` for API access.
* `tiktok_archiver` - use the `generic_extractor` to extract TikTok videos.
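As an example (a sketch assuming your old configuration only used these modules), the combined effect of removing the obsolete modules and applying the renames from sections (a) and (b) looks like:
```{code} yaml
steps:
  ...
  archivers:
  - twitter_archiver
  - tiktok_archiver
  - youtube_archiver
# after removing the obsolete modules and renaming the rest, the above config becomes:
steps:
  ...
  extractors:
  - generic_extractor
```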
### 2. Auto-generate a new config, then copy over your settings.
Using this method, you can have Auto Archiver auto-generate a configuration file for you, and then copy over the desired settings from your old config file. This is probably the easiest and quickest method to set up, but it may require some trial and error as you copy over your settings.
First, move your existing `orchestration.yaml` file to a different folder or rename it.
Then, you can generate a `simple` or `full` config using:
```{code} console
>>> # generate a simple config
>>> auto-archiver
>>> # config will be written to orchestration.yaml
>>>
>>> # generate a full config
>>> auto-archiver --mode=full
>>>
```
After this, copy over any settings from your old config to the new config.

View file

@ -8,10 +8,10 @@
:caption: Contents:
Overview <self>
contributing
installation/installation.rst
installation/setup
core_modules.md
how_to
contributing
development/developer_guidelines
autoapi/index.rst
```

View file

@ -4,22 +4,42 @@ The Authentication framework for auto-archiver allows you to add login details f
There are two main use cases for authentication:
* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
* Some websites use anti-bot systems to block bot-like tools from accessig the website. Adding real login information to auto-archiver can sometimes bypass this.
* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
## The Authentication Config
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types:
**Username & Password:**
- `username`: str - the username to use for login
- `password`: str - the password to use for login
**API**
- `api_key`: str - the API key to use for login
- `api_secret`: str - the API secret to use for login
**Cookies**
- `cookie`: str - a cookie string to use for login (specific to this site)
- `cookies_from_browser`: str - load cookies from this browser, for this site only.
- `cookies_file`: str - load cookies from this file, for this site only.
```{note}
The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logins.
One of the 'Cookies' options is recommended for the most robust archiving.
```
```{code} yaml
authentication:
# optional file to load authentication information from, for security or multi-system deploy purposes
load_from_file: path/to/authentication/file.txt
# optional setting to load cookies from the named browser on the system.
# optional setting to load cookies from the named browser on the system, for **ALL** websites
cookies_from_browser: firefox
# optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
# optional setting to load cookies from a cookies.txt/cookies.jar file, for **ALL** websites. See note below on extracting these
cookies_file: path/to/cookies.jar
twitter.com,x.com:
mysite.com:
username: myusername
password: 123
@ -29,15 +49,10 @@ authentication:
othersite.com:
api_key: 123
api_secret: 1234
# All available options:
# - username: str - the username to use for login
# - password: str - the password to use for login
# - api_key: str - the API key to use for login
# - api_secret: str - the API secret to use for login
# - cookie: str - a cookie string to use for login (specific to this site)
```
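When using `load_from_file`, the separate file can be JSON or YAML and follows the same structure (optionally wrapped in a top-level `authentication:` key). A minimal sketch of such a file (the filename and values are placeholders):
```{code} yaml
:caption: secrets/authentication.yaml
authentication:
  mysite.com:
    username: myusername
    password: 123
  x.com,twitter.com:
    cookies_file: secrets/twitter_cookies.txt
```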
### Recommendations for authentication
1. **Store authentication information separately:**

View file

@ -1,13 +1,18 @@
# Configuration
This section of the documentation provides guidelines for configuring the tool.
The recommended way to configure auto-archiver for first-time users is to [run the Auto Archiver](setup.md#running) and have it auto-generate a default configuration for you. Then, if needed, you can edit the configuration file using one of the following methods.
## Configuring using a file
The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow.
## 1. Configuration file
The structure of orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for different modules), here's a simplification:
The configuration file is typically called `orchestration.yaml` and stored in the `secrets` folder on your desktop. The configuration file contains all the settings for your entire Auto Archiver workflow in one easy-to-find place.
If you want to have Auto Archiver run with the recommended 'basic' setup, the auto-generated configuration file can be used as-is without any further editing.
### Advanced Configuration
The structure of the orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for individual modules).
A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like:
@ -21,9 +26,9 @@ A default `orchestration.yaml` will be created for you the first time you run au
</details>
## Configuring from the Command Line
## 2. Command Line configuration
You can run auto-archiver directy from the command line, without the need for a configuration file, command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line.
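As an illustration of how command line flags map onto the configuration file, the `--instagram_extractor.api_key=API_KEY` flag above corresponds to the following YAML snippet (a sketch; `API_KEY` is a placeholder), which is roughly what `-s/--store` would write back into your config:
```{code} yaml
:caption: orchestration.yaml
...
instagram_extractor:
  api_key: API_KEY
...
```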

View file

@ -1,80 +1,44 @@
# Installing Auto Archiver
# Installation
```{toctree}
:depth: 1
:hidden:
There are 3 main ways to use the auto-archiver. We recommend the 'docker' method for most uses. This installs all the requirements in one command.
configurations.md
config_cheatsheet.md
```
There are 3 main ways to use the auto-archiver:
1. Easiest: [via docker](#installing-with-docker)
1. Easiest (recommended): [via docker](#installing-with-docker)
2. Local Install: [using pip](#installing-locally-with-pip)
3. Developer Install: [see the developer guidelines](../development/developer_guidelines)
But **you always need a configuration/orchestration file**, which is where you'll configure where/what/how to archive. Make sure you read [orchestration](#orchestration).
## Installing with Docker
## 1. Installing with Docker
[![dockeri.co](https://dockerico.blankenship.io/image/bellingcat/auto-archiver)](https://hub.docker.com/r/bellingcat/auto-archiver)
Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
Docker works like a virtual machine running inside your computer, making installation simple. You'll need to first set up Docker, and then download the Auto Archiver 'image':
1. Install [docker](https://docs.docker.com/get-docker/)
2. Pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
3. Run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
1. `docker run` tells docker to start a new container (an instance of the image)
2. `--rm` makes sure this container is removed after execution (less garbage locally)
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
3. `/app/secrets` points to the path the docker container where this image can be found
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
1. `-v` same as above, this is a volume instruction
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
**a) Download and install docker**
### Example invocations
Go to the [Docker website](https://docs.docker.com/get-docker/) and download right version for your operating system.
The invocations below will run the auto-archiver Docker image using a configuration file that you have specified
**b) Pull the Auto Archiver docker image**
Open your command line terminal, and copy-paste / type:
```bash
# all the configurations come from ./secrets/orchestration.yaml
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
docker pull bellingcat/auto-archiver
```
## Installing Locally with Pip
This will download the docker image, which may take a while.
That's it, all done! You're now ready to set up [your configuration file](configurations.md). Or, if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-docker-install).
------------
## 2. Installing Locally with Pip
1. Make sure you have python 3.10 or higher installed
2. Install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
3. Test it's installed with `auto-archiver --help`
4. Install other local dependency requirements (for )
5. Run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
4. Install other local dependency requirements (for example `ffmpeg`, `firefox`)
### Example invocations
Once all your [local requirements](#installing-local-requirements) are correctly installed, the following example invocations will run auto-archiver:
```bash
# all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
```
After this, you're ready to set up [your configuration file](configurations.md), or if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-local-install).
### Installing Local Requirements

View file

@ -0,0 +1,14 @@
# Requirements
Using the Auto Archiver is very simple, but ideally you have some familiarity with using the command line to run programs. ([Command line crash course](https://developer.mozilla.org/en-US/docs/Learn_web_development/Getting_started/Environment_setup/Command_line)).
### System Requirements
* Auto Archiver works on any Windows, macOS and Linux computer
* If you're using the **local install** method, then you should make sure to have python3.10+ installed
### Storage Requirements
By default, Auto Archiver uses your local computer storage for any downloaded media (videos, images etc.). If you're downloading large files, this may take up a lot of your local computer's space (more than 5GB of space).
If your storage space is limited, then you may want to set up an [alternative storage method](../modules/storage.md) for your media.
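As a sketch of what enabling an alternative storage might look like in your configuration (assuming you use the S3 storage module mentioned in the example invocations elsewhere in these docs; the option value shown is a placeholder):
```{code} yaml
:caption: orchestration.yaml
steps:
  ...
  storages:
  - s3_storage   # upload archived media to S3 instead of filling up local disk

s3_storage:
  private: true  # corresponds to the --s3_storage.private=1 flag used in the example invocations
```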

View file

@ -0,0 +1,76 @@
# Getting Started
```{toctree}
:maxdepth: 1
:hidden:
installation.md
configurations.md
authentication.md
requirements.md
config_cheatsheet.md
```
## Getting Started
To get started with Auto Archiver, there are 3 main steps you need to complete.
1. [Install Auto Archiver](installation.md)
2. [Setup up your configuration](configurations.md) (if you are ok with the default settings, you can skip this step)
3. Run the archiving process<a id="running"></a>
The way you run the Auto Archiver depends on how you installed it (docker install or local install)
### Running a Docker Install
If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:
```bash
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
```
breaking this command down:
1. `docker run` tells docker to start a new container (an instance of the image)
2. `--rm` makes sure this container is removed after execution (less garbage locally)
3. `-v $PWD/secrets:/app/secrets` - your secrets folder with settings
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to); we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
3. `/app/secrets` points to the path inside the docker container where that folder is mounted
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
1. `-v` same as above, this is a volume instruction
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
### Example invocations
The invocations below will run the auto-archiver Docker image using a configuration file that you have specified.
```bash
# Have auto-archiver run with the default settings, generating a settings file in ./secrets/orchestration.yaml
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --mode full
```
------------
### Running a Local Install
### Example invocations
Once all your [local requirements](#installing-local-requirements) are correctly installed, the following example invocations will run auto-archiver:
```bash
# all the configurations come from ./secrets/orchestration.yaml
auto-archiver --config secrets/orchestration.yaml
# uses the same configurations but for another google docs sheet
# with a header on row 2 and with some different column names
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
```

View file

@ -8,7 +8,7 @@ The default (enabled) databases are the CSV Database and the Console Database.
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/database/*

View file

@ -7,7 +7,7 @@ Enricher modules are used to add additional information to the items that have
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/enricher/*

View file

@ -4,14 +4,14 @@ Extractor modules are used to extract the content of a given URL. Typically, one
Extractors that are able to extract content from a wide range of websites include:
1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
2. Wayback Machine Extractor: sends pages to the Waygback machine for archiving, and stores the link.
2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link.
3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
```{include} autogen/extractor.md
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/extractor/*

View file

@ -1,8 +1,8 @@
# Feeder Modules
Feeder modules are used to feed URLs into the `auto-archiver` for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line.
Feeder modules are used to feed URLs into the Auto Archiver for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line.
The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into the `auto-archiver` from the command line.
The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into `auto-archiver` from the command line.
Command line feeder usage:
```{code} bash
@ -13,7 +13,7 @@ auto-archiver [options] -- URL1 URL2 ...
```
```{toctree}
:depth: 1
:maxdepth: 1
:glob:
:hidden:
autogen/feeder/*

View file

@ -6,7 +6,7 @@ Formatter modules are used to format the data extracted from a URL into a specif
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/formatter/*

View file

@ -8,7 +8,7 @@ The default is to store the files downloaded (e.g. images, videos) in a local di
```
```{toctree}
:depth: 1
:maxdepth: 1
:hidden:
:glob:
autogen/storage/*

View file

@ -51,7 +51,6 @@ class BaseModule(ABC):
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
config = deepcopy(config)
@ -86,11 +85,13 @@ class BaseModule(ABC):
* api_key: str - the API key to use for login\n
* api_secret: str - the API secret to use for login\n
* cookie: str - a cookie string to use for login (specific to this site)\n
* cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
* cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
site = UrlUtil.domain_for_url(site)
site = UrlUtil.domain_for_url(site).lstrip("www.")
# add the 'www' version of the site to the list of sites to check
authdict = {}
@ -116,17 +117,30 @@ class BaseModule(ABC):
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
get_cookiejar_options = None
# get the cookies jar, prefer the browser cookies than the file
if 'cookies_from_browser' in self.authentication:
# order of priority:
# 1. cookies_from_browser setting in site config
# 2. cookies_file setting in site config
# 3. cookies_from_browser setting in global config
# 4. cookies_file setting in global config
if 'cookies_from_browser' in authdict:
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
elif 'cookies_file' in authdict:
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
elif 'cookies_from_browser' in self.authentication:
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
elif 'cookies_file' in self.authentication:
authdict['cookies_file'] = self.authentication['cookies_file']
if extract_cookies:
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
if get_cookiejar_options:
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
return authdict
def repr(self):

View file

@ -7,6 +7,7 @@ flexible setup in various environments.
import argparse
from ruamel.yaml import YAML, CommentedMap, add_representer
import json
from loguru import logger
@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple
_yaml: YAML = YAML()
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
"""
@ -52,6 +55,57 @@ logging:
""")
# note: 'logging' is explicitly added above in order to better format the config file
# Arg Parse Actions/Classes
class AuthenticationJsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
auth_dict = json.loads(values)
setattr(namespace, self.dest, auth_dict)
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
def load_from_file(path):
try:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
class DefaultValidatingParser(argparse.ArgumentParser):
def error(self, message):
@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
return super().parse_known_args(args, namespace)
# Config Utils
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
dotdict = {}
@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
pass
if not config:
config = EMPTY_CONFIG
config = deepcopy(EMPTY_CONFIG)
return config
# TODO: make this tidier/find a way to notify of which keys should not be stored
@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)
def is_valid_config(config: CommentedMap) -> bool:
return config and config != EMPTY_CONFIG
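
For readers following the authentication changes above: `AuthenticationJsonParseAction` expects a dictionary keyed by site name, with a few global options (`cookies_from_browser`, `cookies_file`, `load_from_file`) allowed alongside the per-site entries, and comma-separated site keys are later split apart by `ArchivingOrchestrator.setup_authentication`. Below is a minimal sketch of that shape; all site names and credential values are hypothetical.

```python
# Minimal sketch of the dictionary shape accepted by AuthenticationJsonParseAction,
# e.g. supplied on the command line as --authentication '<JSON string>'.
# All site names and credential values below are hypothetical examples.
example_authentication = {
    # per-site entries: site name -> authentication method
    "example.com": {"username": "my_user", "password": "my_password"},
    # comma-separated keys are split into one entry per site by
    # ArchivingOrchestrator.setup_authentication()
    "site1.example,site2.example": {"cookie": "session=abc123"},
    # global options recognised alongside the per-site entries
    "cookies_from_browser": "firefox",
    "cookies_file": "secrets/cookies.txt",
}
```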

Wyświetl plik

@ -13,7 +13,7 @@ from abc import abstractmethod
from auto_archiver.core import Metadata, BaseModule
class Enricher(BaseModule):
"""Base classes and utilities for enrichers in the Auto-Archiver system.
"""Base classes and utilities for enrichers in the Auto Archiver system.
Enricher modules must implement the `enrich` method to define their behavior.
"""

Wyświetl plik

@ -134,7 +134,6 @@ class LazyBaseModule:
"""
name: str
type: list
description: str
path: str
module_factory: ModuleFactory
@ -148,6 +147,10 @@ class LazyBaseModule:
self.path = path
self.module_factory = factory
@property
def type(self):
return self.manifest['type']
@property
def entry_point(self):
if not self._entry_point and not self.manifest['entry_point']:
@ -183,10 +186,9 @@ class LazyBaseModule:
try:
manifest.update(ast.literal_eval(f.read()))
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
self._manifest = manifest
self.type = manifest['type']
self._entry_point = manifest['entry_point']
self.description = manifest['description']
self.version = manifest['version']
@ -254,7 +256,7 @@ class LazyBaseModule:
instance.module_factory = self.module_factory
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)

Wyświetl plik

@ -6,95 +6,31 @@
from __future__ import annotations
from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse
from ipaddress import ip_address
from copy import copy
import argparse
import os
import sys
import json
from tempfile import TemporaryDirectory
import traceback
from copy import copy
from rich_argparse import RichHelpFormatter
from loguru import logger
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES
from loguru import logger
from auto_archiver.utils.url import check_url_or_raise
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
setattr(namespace, self.dest, json.loads(values))
except json.JSONDecodeError as e:
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
def load_from_file(path):
try:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
# extract out concatenated sites
for key, val in copy(auth_dict).items():
if "," in key:
for site in key.split(","):
auth_dict[site] = val
del auth_dict[key]
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
class SetupError(ValueError):
pass
class ArchivingOrchestrator:
# instance variables
@ -163,7 +99,7 @@ class ArchivingOrchestrator:
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
if is_valid_config(yaml_config):
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
@ -189,7 +125,13 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_individual_module_args(self.module_factory.available_modules(), parser)
all_modules = self.module_factory.available_modules()
# add all the modules to the steps
for module in all_modules:
for module_type in module.type:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
self.add_individual_module_args(all_modules, parser)
parser.set_defaults(**to_dot_notation(yaml_config))
@ -198,6 +140,9 @@ class ArchivingOrchestrator:
# merge the new config with the old one
config = merge_dicts(vars(parsed), yaml_config)
# set up the authentication dict as needed
config = self.setup_authentication(config)
# clean out args from the base_parser that we don't want in the config
for key in vars(basic_config):
config.pop(key, None)
@ -286,14 +231,20 @@ class ArchivingOrchestrator:
self.basic_parser.exit()
def setup_logging(self, config):
logging_config = config['logging']
if logging_config.get('enabled', True) is False:
# disabled logging settings, they're set on a higher level
logger.disable('auto_archiver')
return
# setup loguru logging
try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging']
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
@ -312,27 +263,25 @@ class ArchivingOrchestrator:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
if not modules_to_load:
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
def check_steps_ok():
if not len(step_items):
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
if len(modules_to_load):
logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
exit()
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
exit()
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
for module in modules_to_load:
if module == 'cli_feeder':
# pseudo module, don't load it
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
urls = self.config['urls']
if not urls:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
def feed(self) -> Generator[Metadata]:
for url in urls:
@ -352,13 +301,14 @@ class ArchivingOrchestrator:
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
loaded_module.cleanup()
exit()
raise e
if not loaded_module:
invalid_modules.append(module)
@ -372,7 +322,7 @@ class ArchivingOrchestrator:
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
raise FileNotFoundError(f"Configuration file {config_file} not found")
return read_yaml(config_file)
@ -437,8 +387,12 @@ class ArchivingOrchestrator:
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
"""
self.setup(args)
return self.feed()
try:
self.setup(args)
return self.feed()
except Exception as e:
logger.error(e)
exit(1)
def cleanup(self) -> None:
logger.info("Cleaning up")
@ -503,8 +457,8 @@ class ArchivingOrchestrator:
original_url = result.get_url().strip()
try:
self.assert_valid_url(original_url)
except AssertionError as e:
check_url_or_raise(original_url)
except ValueError as e:
logger.error(f"Error archiving URL {original_url}: {e}")
raise e
@ -564,26 +518,27 @@ class ArchivingOrchestrator:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return result
def assert_valid_url(self, url: str) -> bool:
def setup_authentication(self, config: dict) -> dict:
"""
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
Setup authentication for all modules that require it
Split up strings into multiple sites if they are comma separated
"""
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
parsed = urlparse(url)
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
assert parsed.hostname, f"Invalid URL hostname"
assert parsed.hostname != "localhost", f"Invalid URL"
authentication = config.get('authentication', {})
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError: pass
else:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
site = site.strip()
authentication[site] = val
del authentication[key]
config['authentication'] = authentication
return config
# Helper Properties

Wyświetl plik

@ -1,6 +1,7 @@
# used as validators for config values. Should raise an exception if the value is invalid.
from pathlib import Path
import argparse
import json
def example_validator(value):
if "example" not in value:
@ -16,4 +17,7 @@ def positive_number(value):
def valid_file(value):
if not Path(value).is_file():
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
return value
return value
def json_loader(cli_val):
return json.loads(cli_val)

Wyświetl plik

@ -1,5 +1,5 @@
{
"name": "Auto-Archiver API Database",
"name": "Auto Archiver API Database",
"type": ["database"],
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
@ -39,7 +39,7 @@
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
Provides integration with the Auto Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
@ -49,6 +49,6 @@
- **Optional Storage**: Archives results conditionally based on configuration.
### Setup
Requires access to an Auto-Archiver API instance and a valid API token.
Requires access to an Auto Archiver API instance and a valid API token.
""",
}

Wyświetl plik

@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
ydl_options['cookiefile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

Wyświetl plik

@ -15,7 +15,8 @@
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
"required": True,
},
"columns": {
"default": {
@ -34,16 +35,16 @@
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
"type": "json_loader",
},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
},
"use_sheet_names_in_stored_paths": {
"default": True,
@ -64,8 +65,10 @@
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
- Supports organizing stored files into folder paths based on sheet and worksheet names.
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
### Setup
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
- Customize the column names in your Google sheet using the `columns` configuration.
""",
}

Wyświetl plik

@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
)
if not self.sheet and not self.sheet_id:
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
def open_sheet(self):
if self.sheet:

Wyświetl plik

@ -18,7 +18,7 @@
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
"type": "auto_archiver.utils.json_loader",
"type": "json_loader",
}
},
"description": """

Wyświetl plik

@ -1 +0,0 @@
from .wacz_enricher import WaczExtractorEnricher

Wyświetl plik

@ -0,0 +1 @@
from .wacz_extractor_enricher import WaczExtractorEnricher

Wyświetl plik

@ -1,7 +1,7 @@
{
"name": "WACZ Enricher",
"name": "WACZ Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": [

Wyświetl plik

@ -1,5 +1,5 @@
{
"name": "Wayback Machine Enricher",
"name": "Wayback Machine Enricher (and Extractor)",
"type": ["enricher", "extractor"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,

Wyświetl plik

@ -59,10 +59,6 @@ def random_str(length: int = 32) -> str:
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
return json.loads(cli_val)
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
hash = hash_algo()
with open(filename, "rb") as f:

Wyświetl plik

@ -1,5 +1,6 @@
import re
from urllib.parse import urlparse, urlunparse
from ipaddress import ip_address
AUTHWALL_URLS = [
@ -7,6 +8,43 @@ AUTHWALL_URLS = [
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
]
def check_url_or_raise(url: str) -> bool | ValueError:
"""
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
"""
if not (url.startswith("http://") or url.startswith("https://")):
raise ValueError(f"Invalid URL scheme for url {url}")
parsed = urlparse(url)
if not parsed.hostname:
raise ValueError(f"Invalid URL hostname for url {url}")
if parsed.hostname == "localhost":
raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})")
if parsed.scheme not in ["http", "https"]:
raise ValueError(f"Invalid URL scheme, only http and https supported (for url {url})")
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError:
pass
else:
if not ip.is_global:
raise ValueError(f"IP address {ip} is not globally reachable")
if ip.is_reserved:
raise ValueError(f"Reserved IP address {ip} used")
if ip.is_link_local:
raise ValueError(f"Link-local IP address {ip} used")
if ip.is_private:
raise ValueError(f"Private IP address {ip} used")
return True
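
A brief usage sketch of `check_url_or_raise` as imported by the orchestrator above (`from auto_archiver.utils.url import check_url_or_raise`); the URLs below are illustrative only.

```python
from auto_archiver.utils.url import check_url_or_raise

for candidate in ("https://example.com/post/123", "http://127.0.0.1/admin", "ftp://example.com"):
    try:
        check_url_or_raise(candidate)  # returns True for archivable http(s) URLs
        print(f"OK to archive: {candidate}")
    except ValueError as e:
        # loopback/private/reserved IPs and non-http(s) schemes are rejected
        print(f"Refusing to archive {candidate}: {e}")
```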
def domain_for_url(url: str) -> str:
"""
SECURITY: parse the domain using urllib to avoid any potential security issues

Wyświetl plik

@ -18,7 +18,7 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
"socks_proxy_port": None,
"proxy_server": None,
}
wacz = setup_module("wacz_enricher", configs)
wacz = setup_module("wacz_extractor_enricher", configs)
return wacz

Wyświetl plik

@ -68,7 +68,7 @@ class TestGenericExtractor(TestExtractorBase):
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1"
])
def test_download_nonexistend_media(self, make_item, url):
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existent posts/media

Wyświetl plik

@ -9,7 +9,7 @@ from auto_archiver.core import Metadata, Feeder
def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
# Ensure setup() raises ValueError if neither sheet nor sheet_id is set.
mocker.patch("gspread.service_account")
with pytest.raises(AssertionError):
with pytest.raises(ValueError):
setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},

Wyświetl plik

@ -6,7 +6,9 @@ from auto_archiver.__main__ import main
@pytest.fixture
def orchestration_file_path(tmp_path):
return (tmp_path / "example_orch.yaml").as_posix()
folder = tmp_path / "secrets"
folder.mkdir(exist_ok=True)
return (folder / "example_orch.yaml").as_posix()
@pytest.fixture
def orchestration_file(orchestration_file_path):
@ -28,6 +30,7 @@ def autoarchiver(tmp_path, monkeypatch, request):
logger.add(sys.stderr)
request.addfinalizer(cleanup)
(tmp_path / "secrets").mkdir(exist_ok=True)
# change dir to tmp_path
monkeypatch.chdir(tmp_path)
@ -66,6 +69,7 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
# monkey patch to change the current working directory, so that we don't use the user's real config file
monkeypatch.chdir(tmp_path)
(tmp_path / "secrets").mkdir(exist_ok=True)
with monkeypatch.context() as m:
m.setattr(sys, "argv", ["auto-archiver"])
with pytest.raises(SystemExit):

Wyświetl plik

@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core import Metadata
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/"
@ -160,4 +160,26 @@ def test_load_settings_for_module_from_commandline(orchestrator, test_args):
assert len(orchestrator.feeders) == 1
assert orchestrator.feeders[0].name == "gsheet_feeder"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
def test_multiple_orchestrator(test_args):
o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
o1 = ArchivingOrchestrator()
with pytest.raises(ValueError) as exit_error:
# this should fail because the gsheet_feeder requires a sheet_id / sheet
o1.setup(o1_args)
o2_args = test_args + ["--feeders", "example_module"]
o2 = ArchivingOrchestrator()
o2.setup(o2_args)
assert o2.feeders[0].name == "example_module"
output: Metadata = list(o2.feed())
assert len(output) == 1
assert output[0].get_url() == "https://example.com"