diff --git a/README.md b/README.md index 368a904..b39038e 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,13 @@ Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat. ## Installation -View the [Installation Guide](installation/installation.md) for full instructions +View the [Installation Guide](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html) for full instructions + +**Advanced:** To get started quickly using Docker: -`docker pull bellingcat/auto-archiver && docker run` +`docker pull bellingcat/auto-archiver && docker run --rm -v secrets:/app/secrets bellingcat/auto-archiver --config secrets/orchestration.yaml` Or pip: diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index a5f2998..66ba14d 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -3,6 +3,7 @@ from pathlib import Path from auto_archiver.core.module import ModuleFactory from auto_archiver.core.base_module import BaseModule from ruamel.yaml import YAML +from ruamel.yaml.comments import CommentedMap import io MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_archiver" / "modules" @@ -30,6 +31,7 @@ steps: ... {config_string} + """ def generate_module_docs(): @@ -38,8 +40,9 @@ def generate_module_docs(): modules_by_type = {} header_row = "| " + " | ".join(TABLE_HEADER) + "|\n" + "| --- " * len(TABLE_HEADER) + "|\n" - configs_cheatsheet = "\n## Configuration Options\n" - configs_cheatsheet += header_row + global_table = "\n## Configuration Options\n" + header_row + + global_yaml = yaml.load("""\n# Module configuration\nplaceholder: {}""") for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)): # generate the markdown file from the __manifest__.py file. @@ -66,19 +69,30 @@ def generate_module_docs(): config_table = header_row config_yaml = {} + + global_yaml[module.name] = CommentedMap() + global_yaml.yaml_set_comment_before_after_key(module.name, f"\n\n{module.display_name} configuration options") + + for key, value in manifest['configs'].items(): type = value.get('type', 'string') - if type == 'auto_archiver.utils.json_loader': + if type == 'json_loader': value['type'] = 'json' elif type == 'str': type = "string" default = value.get('default', '') config_yaml[key] = default + + global_yaml[module.name][key] = default + + if value.get('help', ''): + global_yaml[module.name].yaml_add_eol_comment(value.get('help', ''), key) + help = "**Required**. " if value.get('required', False) else "Optional. 
" help += value.get('help', '') config_table += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n" - configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n" + global_table += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n" readme_str += "\n## Configuration Options\n" readme_str += "\n### YAML\n" @@ -103,8 +117,13 @@ def generate_module_docs(): f.write(readme_str) generate_index(modules_by_type) + del global_yaml['placeholder'] + global_string = io.BytesIO() + global_yaml = yaml.dump(global_yaml, global_string) + global_string = global_string.getvalue().decode('utf-8') + global_yaml = f"```yaml\n{global_string}\n```" with open(SAVE_FOLDER / "configs_cheatsheet.md", "w") as f: - f.write(configs_cheatsheet) + f.write("### Configuration File\n" + global_yaml + "\n### Command Line\n" + global_table) def generate_index(modules_by_type): diff --git a/docs/source/bc.png b/docs/source/bc.png new file mode 100644 index 0000000..766529b Binary files /dev/null and b/docs/source/bc.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 5b1ad9b..ee6416e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -3,9 +3,11 @@ import sys import os from importlib.metadata import metadata +from datetime import datetime sys.path.append(os.path.abspath('../scripts')) from scripts import generate_module_docs +from auto_archiver.version import __version__ # -- Project Hooks ----------------------------------------------------------- # convert the module __manifest__.py files into markdown files @@ -15,7 +17,8 @@ generate_module_docs() # -- Project information ----------------------------------------------------- package_metadata = metadata("auto-archiver") project = package_metadata["name"] -authors = "Bellingcat" +copyright = str(datetime.now().year) +author = "Bellingcat" release = package_metadata["version"] language = 'en' @@ -32,7 +35,7 @@ extensions = [ ] templates_path = ['_templates'] -exclude_patterns = [] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ""] # -- AutoAPI Configuration --------------------------------------------------- @@ -76,6 +79,14 @@ source_suffix = { html_theme = 'sphinx_book_theme' html_static_path = ["../_static"] html_css_files = ["custom.css"] +html_title = f"Auto Archiver v{__version__}" +html_logo = "bc.png" +html_theme_options = { + "repository_url": "https://github.com/bellingcat/auto-archiver", + "use_repository_button": True, +} + + copybutton_prompt_text = r">>> |\.\.\." copybutton_prompt_is_regexp = True diff --git a/docs/source/core_modules.md b/docs/source/core_modules.md index 3a8e5ec..58eff08 100644 --- a/docs/source/core_modules.md +++ b/docs/source/core_modules.md @@ -1,8 +1,8 @@ # Module Documentation -These pages describe the core modules that come with `auto-archiver` and provide the main functionality for archiving websites on the internet. There are five core module types: +These pages describe the core modules that come with Auto Archiver and provide the main functionality for archiving websites on the internet. There are five core module types: -1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing +1. Feeders - these 'feed' information (the URLs) from various sources to the Auto Archiver for processing 2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder 3. Enrichers - these 'enrich' the data extracted in the previous step with additional information 4. 
diff --git a/docs/source/development/creating_modules.md b/docs/source/development/creating_modules.md
index 0950251..49468a4 100644
--- a/docs/source/development/creating_modules.md
+++ b/docs/source/development/creating_modules.md
@@ -1,6 +1,6 @@
 # Creating Your Own Modules

-Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include:
+Modules are what's used to extend Auto Archiver to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [Core Modules](../core_modules.md) should be sufficient for everyday use, but the most common use cases for making your own Modules include:

 1. Extracting data from a website which doesn't work with the current core extractors.
 2. Enriching or altering the data before saving with additional information that the core enrichers do not offer.
@@ -21,7 +21,7 @@ When done, you should have a module structure as follows:
 │   └── awesome_extractor.py
 ```

-Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules.
+Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the Auto Archiver repository for examples of the folder structure for real-world modules.

 ## Populating the Manifest File
diff --git a/docs/source/development/release.md b/docs/source/development/release.md
index 6939e97..403dcb9 100644
--- a/docs/source/development/release.md
+++ b/docs/source/development/release.md
@@ -6,7 +6,7 @@
 1. Update the version number in [version.py](src/auto_archiver/version.py)
 2. Go to github releases > new release > use `vx.y.z` for matching version notation
     1. package is automatically updated in pypi
-    2. docker image is automatically pushed to dockerhup
+    2. docker image is automatically pushed to dockerhub
diff --git a/docs/source/how_to.md b/docs/source/how_to.md
index 25e1e1d..e2238dd 100644
--- a/docs/source/how_to.md
+++ b/docs/source/how_to.md
@@ -1,49 +1,6 @@
 # How-To Guides

-## How to use Google Sheets to load and store archive information
-The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
-This sheet must have been shared with the Google Service account used by `gspread`.
-This sheet must also have specific columns (case-insensitive) in the `header` - see the [Gsheet Feeder Docs](modules/autogen/feeder/gsheet_feeder.md) for more info. The default names of these columns and their purpose is:
-
-Inputs:
-
-* **Link** *(required)*: the URL of the post to archive
-* **Destination folder**: custom folder for archived file (regardless of storage)
-
-Outputs:
-* **Archive status** *(required)*: Status of archive operation
-* **Archive location**: URL of archived post
-* **Archive date**: Date archived
-* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
-* **Timestamp**: Timestamp of original post
-* **Title**: Post title
-* **Text**: Post text
-* **Screenshot**: Link to screenshot of post
-* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
-* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
-* **WACZ**: Link to a WACZ web archive of post
-* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
-
-For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
-
-![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
-
-Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation.
-
-When the auto archiver starts running, it updates the "Archive status" column.
-
-![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
-
-The links are downloaded and archived, and the spreadsheet is updated to the following:
-
-![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
-
-Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
-
-The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
-
-![The archive result for a link in the demo sheet.](../demo-archive.png)
-
+The following pages contain helpful how-to guides for common use cases of the Auto Archiver.

 ---
 ```{toctree}
@@ -51,4 +8,5 @@ The "archive location" link contains the path of the archived file, in local sto
 :glob:
 how_to/*
+
 ```
\ No newline at end of file
diff --git a/docs/source/how_to/authentication_how_to.md b/docs/source/how_to/authentication_how_to.md
new file mode 100644
index 0000000..8994271
--- /dev/null
+++ b/docs/source/how_to/authentication_how_to.md
@@ -0,0 +1,110 @@
+# Logging in to sites
+
+This how-to guide shows you how you can use various authentication methods to log in to a site you are trying to archive. This is useful for websites that require a user to be logged in to browse them, or for sites that restrict bots.
+
+In this How-To, we will authenticate on Twitter/X.com using cookies, and on XXXX using username/password.
+
+
+## Using cookies to authenticate on Twitter/X
+
+It can be useful to archive tweets after logging in, since some tweets are only visible to authenticated users. One case is Tweets marked as 'Sensitive'.
+
+Take this tweet as an example: [https://x.com/SozinhoRamalho/status/1876710769913450647](https://x.com/SozinhoRamalho/status/1876710769913450647)
+
+This tweet has been marked as sensitive, so a normal run of Auto Archiver without a logged in session will fail to extract the tweet:
+
+```{code-block} console
+:emphasize-lines: 3,4,5,6
+
+>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱
+ ...
+ERROR: [twitter] 1876710769913450647: NSFW tweet requires authentication. Use --cookies,
+--cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to
+ provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
+ for how to manually pass cookies
+[twitter] 1876710769913450647: Downloading guest token
+[twitter] 1876710769913450647: Downloading GraphQL JSON
+2025-02-20 15:06:13.362 | ERROR | auto_archiver.modules.generic_extractor.generic_extractor:download_for_extractor:248 - Error downloading metadata for post: NSFW tweet requires authentication. Use --cookies, --cookies-from-browser, --username and --password, --netrc-cmd, or --netrc (twitter) to provide account credentials. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies
+[generic] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647
+[generic] 1876710769913450647: Downloading webpage
+WARNING: [generic] Falling back on generic information extractor
+[generic] 1876710769913450647: Extracting information
+ERROR: Unsupported URL: https://x.com/SozinhoRamalho/status/1876710769913450647
+2025-02-20 15:06:13.744 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor telegram_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647
+2025-02-20 15:06:13.744 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='nothing archived', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 6, 12, 473979, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[])
+...
+```
+
+To get around this limitation, we can use **cookies** (information about a logged in user) to mimic being logged in to Twitter. There are two ways to pass cookies to Auto Archiver. One is from a file, and the other is from a browser profile on your computer.
+
+In this tutorial, we will export the Twitter cookies from our browser and add them to Auto Archiver.
+
+**1. Installing a cookie exporter extension**
+
+First, we need to install an extension in our browser to export the cookies for a certain site. The [FAQ on yt-dlp](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp) provides some suggestions: Get [cookies.txt LOCALLY](https://chromewebstore.google.com/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) for Chrome or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) for Firefox.
+
+**2. Export the cookies**
+
+```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for archiving.
+``` + +Once the extension is installed in your preferred browser, login to Twitter in this browser, and then activate the extension and export the cookies. You can choose to export all your cookies for your browser, or just cookies for this specific site. In the image below, we're only exporting cookies for Twitter/x.com: + +![extract cookies](extract_cookies.png) + + +**3. Adding the cookies file to Auto Archiver** + +You now will have a file called `cookies.txt` (tip: name it `twitter_cookies.txt` if you only exported cookies for Twitter), which needs to be added to Auto Archiver. + +Do this by going into your Auto Archiver configuration file, and editing the `authentication` section. We will add the `cookies_file` option for the site `x.com,twitter.com`. + +```{note} For websites that have multiple URLs (like x.com and twitter.com) you can 'reuse' the same login information without duplicating it using a comma separated list of domain names. +``` + +I've saved my `twitter_cookies.txt` file in a `secrets` folder, so here's how my authentication section looks now: + +```{code} yaml +:caption: orchestration.yaml + +... + +authentication: + x.com,twitter.com: + cookies_file: secrets/twitter_cookies.txt +... +``` + +**4. Re-run your archiving with the cookies enabled** + +Now, the next time we re-run Auto Archiver, the cookies from our logged-in session will be used by Auto Archiver, and restricted/sensitive tweets can be downloaded! + +```{code} console +>>> auto-archiver https://x.com/SozinhoRamalho/status/1876710769913450647 ✭ ✱ ◼ +... +2025-02-20 15:27:46.785 | WARNING | auto_archiver.modules.console_db.console_db:started:13 - STARTED Metadata(status='no archiver', metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 46, 785304, tzinfo=datetime.timezone.utc), 'url': 'https://x.com/SozinhoRamalho/status/1876710769913450647'}, media=[]) +2025-02-20 15:27:46.785 | INFO | auto_archiver.core.orchestrator:archive:483 - Trying extractor generic_extractor for https://x.com/SozinhoRamalho/status/1876710769913450647 +[twitter] Extracting URL: https://x.com/SozinhoRamalho/status/1876710769913450647 +... +2025-02-20 15:27:53.134 | INFO | auto_archiver.modules.local_storage.local_storage:upload:26 - ./local_archive/https-x-com-sozinhoramalho-status-1876710769913450647/06e8bacf27ac4bb983bf6280.html +2025-02-20 15:27:53.135 | SUCCESS | auto_archiver.modules.console_db.console_db:done:23 - DONE Metadata(status='yt-dlp_Twitter: success', +metadata={'_processed_at': datetime.datetime(2025, 2, 20, 15, 27, 48, 564738, tzinfo=datetime.timezone.utc), 'url': +'https://x.com/SozinhoRamalho/status/1876710769913450647', 'title': 'ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1', +... +``` + + +### Finishing Touches + +You've now successfully exported your cookies from a logged-in session in your browser, and used them to authenticate with Twitter and download a sensitive tweet. Congratulations! + +Finally,Some important things to remember: + +1. It's best not to use your own personal account for archiving. [Here's why](../installation/authentication.md#recommendations-for-authentication). +2. Cookies can be short-lived, so may need updating. Sometimes, a website session may 'expire' or a website may force you to login again. In these instances, you'll need to repeat the export step (step 2) after logging in again to update your cookies. 
+
+## Authenticating on XXXX site with username/password
+
+```{note} This section is still under construction 🚧
+```
diff --git a/docs/source/how_to/extract_cookies.png b/docs/source/how_to/extract_cookies.png
new file mode 100644
index 0000000..73b7917
Binary files /dev/null and b/docs/source/how_to/extract_cookies.png differ
diff --git a/docs/source/how_to/gsheets_setup.md b/docs/source/how_to/gsheets_setup.md
new file mode 100644
index 0000000..20cedd5
--- /dev/null
+++ b/docs/source/how_to/gsheets_setup.md
@@ -0,0 +1,122 @@
+# Using Google Sheets
+
+This guide explains how to set up Google Sheets to process URLs automatically and then store the archiving status back into the Google sheet. It is broadly split into 3 steps:
+
+1. Setting up your Google Sheet
+2. Setting up a service account so Auto Archiver can access the sheet
+3. Configuring the Auto Archiver settings
+
+## 1. Setting up your Google Sheet
+
+Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. Your sheet can have many other columns that the Auto Archiver can use, and you can also include any other columns for your own personal use.
+
+We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project.
+
+Here's an overview of all the columns, and what a complete sheet would look like.
+
+Inputs:
+
+* **Link** *(required)*: the URL of the post to archive
+* **Destination folder**: custom folder for archived file (regardless of storage)
+
+Outputs:
+* **Archive status** *(required)*: Status of archive operation
+* **Archive location**: URL of archived post
+* **Archive date**: Date archived
+* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
+* **Timestamp**: Timestamp of original post
+* **Title**: Post title
+* **Text**: Post text
+* **Screenshot**: Link to screenshot of post
+* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
+* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
+* **WACZ**: Link to a WACZ web archive of post
+* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
+
+For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
+
+![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png)
+
+We'll change the name of the 'Destination Folder' column in step 3.
+
+## 2. Setting up your Service Account
+
+Once your Google Sheet is set up, you need to create what's called a 'service account' that will allow the Auto Archiver to access it.
+
+To do this, follow the steps in [this guide](https://gspread.readthedocs.io/en/latest/oauth2.html) all the way up until step 8. You should have downloaded a file called `service_account.json` and shared the Google Sheet with the 'client_email' address listed in this file.
+
+Once you've downloaded the file, save it to `secrets/service_account.json`.
+
+## 3. Setting up the configuration file
+
+Now that you've set up your Google sheet, and you've set up the service account so Auto Archiver can access the sheet, the final step is to set your configuration.
+
+First, make sure you have `gsheet_feeder` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also set `gsheet_db` in the `steps.databases` section. Here's how this might look:
+
+```{code} yaml
+steps:
+  feeders:
+  - gsheet_feeder
+  ...
+  databases:
+  - gsheet_db # optional, if you also want to store the results in the Google sheet
+  ...
+```
+
+Next, set up the `gsheet_feeder` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up the file, and set the `gsheet_feeder.sheet` setting or the `gsheet_feeder.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'.
+
+Here's how this might look:
+
+```{code} yaml
+...
+gsheet_feeder:
+  sheet: 'My Awesome Sheet'
+  ...
+```
+
+You can also pass these settings directly on the command line without having to edit the file; here's an example of how to do that (using docker):
+
+`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.sheet "Auto archive test 2023-2"`.
+
+Here, the sheet name has been overridden/specified in the command line invocation.
+
+### 3a. (Optional) Changing the column names
+
+In step 1, we said we would change the name of the 'Destination Folder' column. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `gsheet_feeder.columns` setting in the configuration file. For more information on this setting, see the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'columns' setting, and then edit the 'Destination Folder' section to rename it 'Save Folder'. Our final configuration section looks like:
+
+```{code} yaml
+...
+gsheet_feeder:
+  sheet: 'My Awesome Sheet'
+  columns:
+    url: link
+    status: archive status
+    folder: save folder # <-- note how this value has been changed
+    archive: archive location
+    date: archive date
+    thumbnail: thumbnail
+    timestamp: upload timestamp
+    title: upload title
+    text: text content
+    screenshot: screenshot
+    hash: hash
+    pdq_hash: perceptual hashes
+    wacz: wacz
+    replaywebpage: replaywebpage
+```
+
+## Viewing the Results after archiving
+
+With the `gsheet_db` module enabled, once you start running the Auto Archiver, it will update the "Archive status" column.
+
+![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png)
+
+The links are downloaded and archived, and the spreadsheet is updated to the following:
+
+![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png)
+
+Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
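+
+For example, if your sheet has an extra title row and the real header sits on row 2, you can override the header setting on the command line without editing your config file. This is a sketch that reuses the docker invocation from step 3:
+
+`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.header=2`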
+
+The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
+
+![The archive result for a link in the demo sheet.](../demo-archive.png)
diff --git a/docs/source/how_to/logging.md b/docs/source/how_to/logging.md
new file mode 100644
index 0000000..d88882d
--- /dev/null
+++ b/docs/source/how_to/logging.md
@@ -0,0 +1,71 @@
+# Keeping Logs
+
+Auto Archiver's logs can be helpful for debugging problematic archiving processes. This guide shows you how to set up and customise those logs.
+
+## Setting up logging
+
+Logging settings can be set on the command line or using the orchestration config file ([learn more](../installation/configurations.md)). A special `logging` section defines the logging options.
+
+#### Enabling or Disabling Logging
+
+Logging to the console is enabled by default. If you want to globally disable Auto Archiver's logging, then you can set `enabled: false` in your `logging` config:
+
+```{code} yaml
+
+...
+logging:
+  enabled: false
+...
+```
+
+```{note}
+This will disable all logs from Auto Archiver, but it does not disable logs for other tools that the Auto Archiver uses (for example: yt-dlp, firefox or ffmpeg). These logs will still appear in your console.
+```
+
+#### Logging Level
+
+There are 7 logging levels in total, with 4 commonly used levels. They are: `DEBUG`, `INFO`, `WARNING` and `ERROR`.
+
+Change the logging level by setting the value in your orchestration config file:
+
+```{code} yaml
+:caption: orchestration.yaml
+
+...
+logging:
+  level: DEBUG # or INFO / WARNING / ERROR
+...
+```
+
+For normal usage, it is recommended to use the `INFO` level, or if you prefer quieter logs with less information, you can use the `WARNING` level. If you encounter issues with the archiving, then it's recommended to enable the `DEBUG` level.
+
+```{note} To learn about all logging levels, see the [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html)
+```
+
+### Logging to a file
+
+By default, auto-archiver will log to the console. But if you wish to store your logs for future reference, or you are running auto-archiver from within another application, then you may wish to enable file logging. This can be done by setting the `file:` config value in the logging settings.
+
+**Rotation:** For file logging, you can choose to 'rotate' your log files (creating new log files) so they do not get too large. Change this by setting the 'rotation' option in your logging settings. For a full list of rotation options, see the [loguru docs](https://loguru.readthedocs.io/en/stable/overview.html#easier-file-logging-with-rotation-retention-compression).
+
+```{code} yaml
+:caption: orchestration.yaml
+
+logging:
+  ...
+  file: /my/log/file.log
+  rotation: 1 day
+```
+
+### Full logging example
+
+The below example logs only `WARNING` logs to the console and to the file `/my/file.log`, rotating that file once per week:
+
+```{code} yaml
+:caption: orchestration.yaml
+
+logging:
+  level: WARNING
+  file: /my/file.log
+  rotation: 1 week
+```
\ No newline at end of file
diff --git a/docs/source/how_to/new_config_format.md b/docs/source/how_to/new_config_format.md
new file mode 100644
index 0000000..6c12276
--- /dev/null
+++ b/docs/source/how_to/new_config_format.md
@@ -0,0 +1,127 @@
+# Upgrading to v0.13
+
+```{note} This how-to is only relevant for people who used Auto Archiver before February 2025 (versions prior to 0.13).
+
+If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
+```
+
+Version 0.13 of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
+
+## How do I know if I need to update my configuration format?
+
+There are two simple ways to check if you need to update your format:
+
+1. When you try to run auto-archiver using your existing configuration file, you get an error about no feeders or formatters being configured, like:
+
+```{code} console
+AssertionError: No feeders were configured. Make sure to set at least one feeder in
+your configuration file or on the command line (using --feeders)
+```
+
+2. Within your configuration file, you have a `feeder:` option. This is the old format. Here's an example of the old format:
+```{code} yaml
+
+steps:
+  feeder: gsheet_feeder
+...
+```
+
+## Updating your configuration file
+
+To update your configuration file, you can either:
+
+### 1. Manually edit the configuration file and change the values.
+
+This is recommended if you want to keep all your old settings. Follow the steps below to change the relevant settings:
+
+#### a) Feeder & Formatter Steps Settings
+
+The feeder and formatter settings have been changed from a single string to a list.
+
+- `steps.feeder (string)` → `steps.feeders (list)`
+- `steps.formatter (string)` → `steps.formatters (list)`
+
+Example:
+
+```{code} yaml
+
+steps:
+  feeder: cli_feeder
+  ...
+  formatter: html_formatter
+
+# the above should be changed to:
+steps:
+  feeders:
+  - cli_feeder
+  ...
+  formatters:
+  - html_formatter
+```
+
+```{note} Auto Archiver still only supports one feeder and formatter, but from v0.13 onwards they must be added to the configuration file as a list.
+```
+
+#### b) Extractor (formerly Archiver) Steps Settings
+
+With v0.13 of Auto Archiver, `archivers` have been renamed to `extractors` to better reflect what they actually do - extract information from a URL. Change the configuration by renaming:
+
+- `steps.archivers` → `steps.extractors`
+
+The names of the actual modules have also changed, so for any extractor modules you have enabled, you will need to rename the `archiver` part to `extractor`. Some examples:
+
+- `telethon_archiver` → `telethon_extractor`
+- `wacz_archiver_enricher` → `wacz_extractor_enricher`
+- `wayback_archiver_enricher` → `wayback_extractor_enricher`
+- `vk_archiver` → `vk_extractor`
+
+Additionally, the `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md).
+
+Example:
+```{code} yaml
+steps:
+  ...
+  archivers:
+  - telethon_archiver
+  - youtube_archiver
+  - vk_archiver
+
+# after renaming 'archiver' to 'extractor' and youtube_archiver to generic_extractor,
+# the above config will become:
+steps:
+  ...
+  extractors:
+  - telethon_extractor
+  - vk_extractor
+  - generic_extractor
+
+```
+
+#### c) Redundant / Obsolete Modules
+
+With v0.13 of Auto Archiver, the following modules have been removed and their features have been built into the generic_extractor. You should remove them from the 'steps' section of your configuration file:
+
+* `twitter_archiver` - use the `generic_extractor` for general extraction, or the `twitter_api_extractor` for API access.
+* `tiktok_archiver` - use the `generic_extractor` to extract TikTok videos.
+
+
+### 2. Auto-generate a new config, then copy over your settings.
+
+Using this method, you can have Auto Archiver auto-generate a configuration file for you, then you can copy over the desired settings from your old config file. This is probably the easiest and quickest method to set up, but it may require some trial and error as you copy over your settings.
+
+First, move your existing `orchestration.yaml` file to a different folder or rename it.
+
+Then, you can generate a `simple` or `full` config using:
+
+```{code} console
+>>> # generate a simple config
+>>> auto-archiver
+>>> # config will be written to orchestration.yaml
+>>>
+>>> # generate a full config
+>>> auto-archiver --mode=full
+>>>
+```
+
+After this, copy over any settings from your old config to the new config.
+
+
diff --git a/docs/source/index.md b/docs/source/index.md
index 6a7f769..74b7969 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -8,10 +8,10 @@
 :caption: Contents:

 Overview
-contributing
-installation/installation.rst
+installation/setup
 core_modules.md
 how_to
+contributing
 development/developer_guidelines
 autoapi/index.rst
```
\ No newline at end of file
diff --git a/docs/source/how_to/authentication.md b/docs/source/installation/authentication.md
similarity index 69%
rename from docs/source/how_to/authentication.md
rename to docs/source/installation/authentication.md
index 5f3bc48..be30425 100644
--- a/docs/source/how_to/authentication.md
+++ b/docs/source/installation/authentication.md
@@ -4,22 +4,42 @@ The Authentication framework for auto-archiver allows you to add login details f
 There are two main use cases for authentication:

 * Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
-* Some websites use anti-bot systems to block bot-like tools from accessig the website. Adding real login information to auto-archiver can sometimes bypass this.
+* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.

 ## The Authentication Config

-You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
+You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types:
+
+**Username & Password:**
+- `username`: str - the username to use for login
+- `password`: str - the password to use for login
+
+**API:**
+- `api_key`: str - the API key to use for login
+- `api_secret`: str - the API secret to use for login
+
+**Cookies:**
+- `cookie`: str - a cookie string to use for login (specific to this site)
+- `cookies_from_browser`: str - load cookies from this browser, for this site only.
+- `cookies_file`: str - load cookies from this file, for this site only.
+
+```{note}
+
+The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logins.
+
+One of the 'Cookies' options is recommended for the most robust archiving.
+```

 ```{code} yaml
 authentication:
   # optional file to load authentication information from, for security or multi-system deploy purposes
   load_from_file: path/to/authentication/file.txt
-  # optional setting to load cookies from the named browser on the system.
+  # optional setting to load cookies from the named browser on the system, for **ALL** websites
   cookies_from_browser: firefox
-  # optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
+  # optional setting to load cookies from a cookies.txt/cookies.jar file, for **ALL** websites. See note below on extracting these
   cookies_file: path/to/cookies.jar
-  twitter.com,x.com:
+  mysite.com:
     username: myusername
     password: 123
@@ -29,15 +49,10 @@ authentication:
   othersite.com:
     api_key: 123
     api_secret: 1234
-
-# All available options:
-  # - username: str - the username to use for login
-  # - password: str - the password to use for login
-  # - api_key: str - the API key to use for login
-  # - api_secret: str - the API secret to use for login
-  # - cookie: str - a cookie string to use for login (specific to this site)
+
 ```

+
 ### Recommendations for authentication

 1. **Store authentication information separately:**
diff --git a/docs/source/installation/configurations.md b/docs/source/installation/configurations.md
index 705b6c5..e3aa76e 100644
--- a/docs/source/installation/configurations.md
+++ b/docs/source/installation/configurations.md
@@ -1,13 +1,18 @@
 # Configuration

-This section of the documentation provides guidelines for configuring the tool.
+The recommended way to configure auto-archiver for first-time users is to [run the Auto Archiver](setup.md#running) and have it auto-generate a default configuration for you. Then, if needed, you can edit the configuration file using one of the following methods.

-## Configuring using a file
+## 1. Configuration file

-The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow.
+The configuration file is typically called `orchestration.yaml` and stored in the `secrets` folder of your working directory. The configuration file contains all the settings for your entire Auto Archiver workflow in one easy-to-find place.

-The structure of orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for different modules), here's a simplification:
+If you want to have Auto Archiver run with the recommended 'basic' setup, you can use this file as it is generated and skip straight to [running Auto Archiver](setup.md#running).
+
+### Advanced Configuration
+
+The structure of the orchestration file is split into 2 parts: `steps` (what [steps](../flow_overview.md) to use) and `configurations` (settings for individual modules).

 A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like:

@@ -21,9 +26,9 @@ A default `orchestration.yaml` will be created for you the first time you run au

-## Configuring from the Command Line
+## 2. Command Line configuration

-You can run auto-archiver directy from the command line, without the need for a configuration file, command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
+You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.

 The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line.
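+
+For example, you could test a module setting on the fly and then persist it back into your configuration file with `-s/--store`. A sketch (assuming your config lives at the default `secrets/orchestration.yaml` location, with a hypothetical API key value):
+
+```bash
+auto-archiver --config secrets/orchestration.yaml --instagram_extractor.api_key=API_KEY --store
+```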
diff --git a/docs/source/installation/installation.md b/docs/source/installation/installation.md
index fdd3184..eff0720 100644
--- a/docs/source/installation/installation.md
+++ b/docs/source/installation/installation.md
@@ -1,80 +1,44 @@
-# Installing Auto Archiver
+# Installation

-```{toctree}
-:depth: 1
-:hidden:
+There are 3 main ways to use the auto-archiver. We recommend the 'docker' method for most uses. This installs all the requirements in one command.

-configurations.md
-config_cheatsheet.md
-```
-
-There are 3 main ways to use the auto-archiver:
-1. Easiest: [via docker](#installing-with-docker)
+1. Easiest (recommended): [via docker](#installing-with-docker)
 2. Local Install: [using pip](#installing-locally-with-pip)
 3. Developer Install: [see the developer guidelines](../development/developer_guidelines)

-
-But **you always need a configuration/orchestration file**, which is where you'll configure where/what/how to archive. Make sure you read [orchestration](#orchestration).
-
-
-## Installing with Docker
+## 1. Installing with Docker

 [![dockeri.co](https://dockerico.blankenship.io/image/bellingcat/auto-archiver)](https://hub.docker.com/r/bellingcat/auto-archiver)

-Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
+Docker works like a virtual machine running inside your computer, making installation simple. You'll need to first set up Docker, and then download the Auto Archiver 'image':

-1. Install [docker](https://docs.docker.com/get-docker/)
-2. Pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
-3. Run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
-    1. `docker run` tells docker to start a new container (an instance of the image)
-    2. `--rm` makes sure this container is removed after execution (less garbage locally)
-    3. `-v $PWD/secrets:/app/secrets` - your secrets folder
-        1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
-        2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
-        3. `/app/secrets` points to the path the docker container where this image can be found
-    4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
-        1. `-v` same as above, this is a volume instruction
-        2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
-        3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
+**a) Download and install docker**

-### Example invocations
+Go to the [Docker website](https://docs.docker.com/get-docker/) and download the right version for your operating system.

-The invocations below will run the auto-archiver Docker image using a configuration file that you have specified
+**b) Pull the Auto Archiver docker image**
+
+Open your command line terminal, and copy-paste / type:

 ```bash
-# all the configurations come from ./secrets/orchestration.yaml
-docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml
-# uses the same configurations but for another google docs sheet
-# with a header on row 2 and with some different column names
-# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
-docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
-# all the configurations come from orchestration.yaml and specifies that s3 files should be private
-docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
+docker pull bellingcat/auto-archiver
 ```

-## Installing Locally with Pip
+This will download the docker image, which may take a while.
+
+That's it, all done! You're now ready to set up [your configuration file](configurations.md). Or, if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-docker-install).
+
+------------
+
+## 2. Installing Locally with Pip

 1. Make sure you have python 3.10 or higher installed
 2. Install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
 3. Test it's installed with `auto-archiver --help`
-4. Install other local dependency requirements (for )
-5. Run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
+4. Install other local dependency requirements (for example `ffmpeg`, `firefox`)

-### Example invocations
-
-Once all your [local requirements](#installing-local-requirements) are correctly installed, the 
-
-```bash
-# all the configurations come from ./secrets/orchestration.yaml
-auto-archiver --config secrets/orchestration.yaml
-# uses the same configurations but for another google docs sheet
-# with a header on row 2 and with some different column names
-# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
-auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
-# all the configurations come from orchestration.yaml and specifies that s3 files should be private
-auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
-```
+After this, you're ready to set up [your configuration file](configurations.md), or if you want to use the recommended defaults, then you can [run Auto Archiver immediately](setup.md#running-a-local-install).

 ### Installing Local Requirements
diff --git a/docs/source/installation/requirements.md b/docs/source/installation/requirements.md
new file mode 100644
index 0000000..b820272
--- /dev/null
+++ b/docs/source/installation/requirements.md
@@ -0,0 +1,14 @@
+# Requirements
+
+Using the Auto Archiver is very simple, but ideally you have some familiarity with using the command line to run programs. ([Command line crash course](https://developer.mozilla.org/en-US/docs/Learn_web_development/Getting_started/Environment_setup/Command_line)).
+
+### System Requirements
+
+* Auto Archiver works on any Windows, macOS and Linux computer
+* If you're using the **local install** method, then you should make sure to have python3.10+ installed
+
+### Storage Requirements
+
+By default, Auto Archiver uses your local computer storage for any downloaded media (videos, images etc.). If you're downloading large files, this may take up a lot of your local computer's space (more than 5GB of space).
+
+If your storage space is limited, then you may want to set up an [alternative storage method](../modules/storage.md) for your media.
\ No newline at end of file
diff --git a/docs/source/installation/setup.md b/docs/source/installation/setup.md
new file mode 100644
index 0000000..8d1a6f5
--- /dev/null
+++ b/docs/source/installation/setup.md
@@ -0,0 +1,76 @@
+# Getting Started
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+installation.md
+configurations.md
+authentication.md
+requirements.md
+config_cheatsheet.md
+```
+
+## Getting Started
+
+To get started with Auto Archiver, there are 3 main steps you need to complete.
+
+1. [Install Auto Archiver](installation.md)
+2. [Set up your configuration](configurations.md) (if you are ok with the default settings, you can skip this step)
+3. Run the archiving process
+
+The way you run the Auto Archiver depends on how you installed it (docker install or local install).
+
+### Running a Docker Install
+
+If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:
+
+```bash
+docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
+```
+
+breaking this command down:
+ 1. `docker run` tells docker to start a new container (an instance of the image)
+ 2. `--rm` makes sure this container is removed after execution (less garbage locally)
+ 3. `-v $PWD/secrets:/app/secrets` - your secrets folder with settings
+    1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
+    2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
+    3. `/app/secrets` points to the path in the docker container where this folder can be found
+ 4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
+    1. `-v` same as above, this is a volume instruction
+    2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
+    3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
+
+### Example invocations
+
+The invocations below will run the auto-archiver Docker image using a configuration file that you have specified.
+
+```bash
+# Have auto-archiver run with the default settings, generating a settings file in ./secrets/orchestration.yaml
+docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
+
+# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
+# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
+docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
+# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
+docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --mode full
+```
+
+------------
+
+### Running a Local Install
+
+### Example invocations
+
+Once all your [local requirements](#installing-local-requirements) are correctly installed, you can run auto-archiver directly from the command line, as in the examples below:
+
+```bash
+# all the configurations come from ./secrets/orchestration.yaml
+auto-archiver --config secrets/orchestration.yaml
+# uses the same configurations but for another google docs sheet
+# with a header on row 2 and with some different column names
+# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
+auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
+# all the configurations come from orchestration.yaml and specifies that s3 files should be private
+auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
+```
diff --git a/docs/source/modules/database.md b/docs/source/modules/database.md
index 9acecda..3ecd2e2 100644
--- a/docs/source/modules/database.md
+++ b/docs/source/modules/database.md
@@ -8,7 +8,7 @@ The default (enabled) databases are the CSV Database and the Console Database. 
``` ```{toctree} -:depth: 1 +:maxdepth: 1 :hidden: :glob: autogen/database/* diff --git a/docs/source/modules/enricher.md b/docs/source/modules/enricher.md index 30568c3..a145a1d 100644 --- a/docs/source/modules/enricher.md +++ b/docs/source/modules/enricher.md @@ -7,7 +7,7 @@ Enricher modules are used to add additional information to the items that have ``` ```{toctree} -:depth: 1 +:maxdepth: 1 :hidden: :glob: autogen/enricher/* diff --git a/docs/source/modules/extractor.md b/docs/source/modules/extractor.md index 7f218fb..e6375db 100644 --- a/docs/source/modules/extractor.md +++ b/docs/source/modules/extractor.md @@ -4,14 +4,14 @@ Extractor modules are used to extract the content of a given URL. Typically, one Extractors that are able to extract content from a wide range of websites include: 1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library. -2. Wayback Machine Extractor: sends pages to the Waygback machine for archiving, and stores the link. +2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link. 3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format. ```{include} autogen/extractor.md ``` ```{toctree} -:depth: 1 +:maxdepth: 1 :hidden: :glob: autogen/extractor/* diff --git a/docs/source/modules/feeder.md b/docs/source/modules/feeder.md index ce5f7ca..dcac749 100644 --- a/docs/source/modules/feeder.md +++ b/docs/source/modules/feeder.md @@ -1,8 +1,8 @@ # Feeder Modules -Feeder modules are used to feed URLs into the `auto-archiver` for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line. +Feeder modules are used to feed URLs into the Auto Archiver for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line. -The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into the `auto-archiver` from the command line. +The default feeder is the command line feeder (`cli_feeder`), which allows you to input URLs directly into `auto-archiver` from the command line. Command line feeder usage: ```{code} bash @@ -13,7 +13,7 @@ auto-archiver [options] -- URL1 URL2 ... ``` ```{toctree} -:depth: 1 +:maxdepth: 1 :glob: :hidden: autogen/feeder/* diff --git a/docs/source/modules/formatter.md b/docs/source/modules/formatter.md index b7ae77e..7d5713c 100644 --- a/docs/source/modules/formatter.md +++ b/docs/source/modules/formatter.md @@ -6,7 +6,7 @@ Formatter modules are used to format the data extracted from a URL into a specif ``` ```{toctree} -:depth: 1 +:maxdepth: 1 :hidden: :glob: autogen/formatter/* diff --git a/docs/source/modules/storage.md b/docs/source/modules/storage.md index 427213c..d4a2f99 100644 --- a/docs/source/modules/storage.md +++ b/docs/source/modules/storage.md @@ -8,7 +8,7 @@ The default is to store the files downloaded (e.g. images, videos) in a local di ``` ```{toctree} -:depth: 1 +:maxdepth: 1 :hidden: :glob: autogen/storage/* diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 50ea3ff..f18ad13 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -51,7 +51,6 @@ class BaseModule(ABC): def config_setup(self, config: dict): authentication = config.get('authentication', {}) - # this is important. 
Each instance is given its own deepcopied config, so modules cannot
         # change values to affect other modules
         config = deepcopy(config)
@@ -86,11 +85,13 @@ class BaseModule(ABC):
          * api_key: str - the API key to use for login\n
          * api_secret: str - the API secret to use for login\n
          * cookie: str - a cookie string to use for login (specific to this site)\n
+         * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
+         * cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
         """

         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
-        site = UrlUtil.domain_for_url(site)
+        site = UrlUtil.domain_for_url(site).removeprefix("www.")  # strip an exact "www." prefix (lstrip would remove any leading 'w'/'.' chars)
         # add the 'www' version of the site to the list of sites to check
         authdict = {}
@@ -116,17 +117,30 @@ class BaseModule(ABC):
         # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
         ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
         return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
+
+        get_cookiejar_options = None

-        # get the cookies jar, prefer the browser cookies than the file
-        if 'cookies_from_browser' in self.authentication:
+        # order of priority:
+        # 1. cookies_from_browser setting in site config
+        # 2. cookies_file setting in site config
+        # 3. cookies_from_browser setting in global config
+        # 4. cookies_file setting in global config
+
+        if 'cookies_from_browser' in authdict:
+            get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
+        elif 'cookies_file' in authdict:
+            get_cookiejar_options = ['--cookies', authdict['cookies_file']]
+        elif 'cookies_from_browser' in self.authentication:
             authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
-            if extract_cookies:
-                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+            get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
         elif 'cookies_file' in self.authentication:
             authdict['cookies_file'] = self.authentication['cookies_file']
-            if extract_cookies:
-                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+            get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
+
+        if get_cookiejar_options:
+            authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
+
         return authdict

     def repr(self):
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index c3bc706..66d2ffb 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -7,6 +7,7 @@ flexible setup in various environments. 
 import argparse
 from ruamel.yaml import YAML, CommentedMap, add_representer
+import json

 from loguru import logger
@@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple

 _yaml: YAML = YAML()

+DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
+
 EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration

-# Steps are the modules that will be run in the order they are defined
+# Steps are the modules that will be run in the order they are defined
 steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
 """

@@ -52,6 +55,57 @@ logging:
 """)
 # note: 'logging' is explicitly added above in order to better format the config file

+
+# Arg Parse Actions/Classes
+class AuthenticationJsonParseAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+
+        try:
+            auth_dict = json.loads(values)
+            setattr(namespace, self.dest, auth_dict)
+        except json.JSONDecodeError as e:
+            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
+
+        def load_from_file(path):
+            try:
+                with open(path, 'r') as f:
+                    try:
+                        auth_dict = json.load(f)
+                    except json.JSONDecodeError:
+                        f.seek(0)
+                        # maybe it's yaml, try that
+                        auth_dict = _yaml.load(f)
+                    if auth_dict.get('authentication'):
+                        auth_dict = auth_dict['authentication']
+                    auth_dict['load_from_file'] = path
+                    return auth_dict
+            except Exception:
+                return None
+
+        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
+            auth_dict = load_from_file(auth_dict['from_file'])
+        elif isinstance(auth_dict, str):
+            # if it's a string
+            auth_dict = load_from_file(auth_dict)
+
+        if not isinstance(auth_dict, dict):
+            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
+        global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
+        for key, auth in auth_dict.items():
+            if key in global_options:
+                continue
+            if not isinstance(key, str) or not isinstance(auth, dict):
+                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods.
Valid global configs are {global_options}") + + setattr(namespace, self.dest, auth_dict) + + +class UniqueAppendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + for value in values: + if value not in getattr(namespace, self.dest): + getattr(namespace, self.dest).append(value) + class DefaultValidatingParser(argparse.ArgumentParser): def error(self, message): @@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser): return super().parse_known_args(args, namespace) +# Config Utils def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict: dotdict = {} @@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap: pass if not config: - config = EMPTY_CONFIG - + config = deepcopy(EMPTY_CONFIG) + return config # TODO: make this tidier/find a way to notify of which keys should not be stored @@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: - _yaml.dump(config_to_save, outf) \ No newline at end of file + _yaml.dump(config_to_save, outf) + +def is_valid_config(config: CommentedMap) -> bool: + return config and config != EMPTY_CONFIG \ No newline at end of file diff --git a/src/auto_archiver/core/enricher.py b/src/auto_archiver/core/enricher.py index 45e75d7..a862223 100644 --- a/src/auto_archiver/core/enricher.py +++ b/src/auto_archiver/core/enricher.py @@ -13,7 +13,7 @@ from abc import abstractmethod from auto_archiver.core import Metadata, BaseModule class Enricher(BaseModule): - """Base classes and utilities for enrichers in the Auto-Archiver system. + """Base classes and utilities for enrichers in the Auto Archiver system. Enricher modules must implement the `enrich` method to define their behavior. 
""" diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 9556621..5442e71 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -134,7 +134,6 @@ class LazyBaseModule: """ name: str - type: list description: str path: str module_factory: ModuleFactory @@ -148,6 +147,10 @@ class LazyBaseModule: self.path = path self.module_factory = factory + @property + def type(self): + return self.manifest['type'] + @property def entry_point(self): if not self._entry_point and not self.manifest['entry_point']: @@ -183,10 +186,9 @@ class LazyBaseModule: try: manifest.update(ast.literal_eval(f.read())) except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: - logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") + raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") self._manifest = manifest - self.type = manifest['type'] self._entry_point = manifest['entry_point'] self.description = manifest['description'] self.version = manifest['version'] @@ -254,7 +256,7 @@ class LazyBaseModule: instance.module_factory = self.module_factory # merge the default config with the user config - default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) + default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v) config[self.name] = default_config | config.get(self.name, {}) instance.config_setup(config) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 10d9215..274fa9e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -6,95 +6,31 @@ from __future__ import annotations from typing import Generator, Union, List, Type, TYPE_CHECKING -from urllib.parse import urlparse -from ipaddress import ip_address -from copy import copy import argparse import os import sys -import json from tempfile import TemporaryDirectory import traceback +from copy import copy from rich_argparse import RichHelpFormatter - +from loguru import logger from .metadata import Metadata, Media from auto_archiver.version import __version__ -from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser +from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \ + DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE from .module import ModuleFactory, LazyBaseModule from . 
import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from .consts import MODULE_TYPES -from loguru import logger +from auto_archiver.utils.url import check_url_or_raise if TYPE_CHECKING: from .base_module import BaseModule from .module import LazyBaseModule -DEFAULT_CONFIG_FILE = "orchestration.yaml" - - -class JsonParseAction(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - try: - setattr(namespace, self.dest, json.loads(values)) - except json.JSONDecodeError as e: - raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") - - -class AuthenticationJsonParseAction(JsonParseAction): - def __call__(self, parser, namespace, values, option_string=None): - super().__call__(parser, namespace, values, option_string) - auth_dict = getattr(namespace, self.dest) - - def load_from_file(path): - try: - with open(path, 'r') as f: - try: - auth_dict = json.load(f) - except json.JSONDecodeError: - f.seek(0) - # maybe it's yaml, try that - auth_dict = _yaml.load(f) - if auth_dict.get('authentication'): - auth_dict = auth_dict['authentication'] - auth_dict['load_from_file'] = path - return auth_dict - except: - return None - - if isinstance(auth_dict, dict) and auth_dict.get('from_file'): - auth_dict = load_from_file(auth_dict['from_file']) - elif isinstance(auth_dict, str): - # if it's a string - auth_dict = load_from_file(auth_dict) - - if not isinstance(auth_dict, dict): - raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods") - global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file'] - for key, auth in auth_dict.items(): - if key in global_options: - continue - if not isinstance(key, str) or not isinstance(auth, dict): - raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}") - - # extract out concatenated sites - for key, val in copy(auth_dict).items(): - if "," in key: - for site in key.split(","): - auth_dict[site] = val - del auth_dict[key] - - setattr(namespace, self.dest, auth_dict) - - -class UniqueAppendAction(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - for value in values: - if value not in getattr(namespace, self.dest): - getattr(namespace, self.dest).append(value) - - +class SetupError(ValueError): + pass class ArchivingOrchestrator: # instance variables @@ -163,7 +99,7 @@ class ArchivingOrchestrator: # TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser' # but should we add them? Or should we just add them to the 'complete' parser? - if yaml_config != EMPTY_CONFIG: + if is_valid_config(yaml_config): # only load the modules enabled in config # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? 
enabled_modules = [] @@ -189,7 +125,13 @@ class ArchivingOrchestrator: yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode - self.add_individual_module_args(self.module_factory.available_modules(), parser) + all_modules = self.module_factory.available_modules() + # add all the modules to the steps + for module in all_modules: + for module_type in module.type: + yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) + + self.add_individual_module_args(all_modules, parser) parser.set_defaults(**to_dot_notation(yaml_config)) @@ -198,6 +140,9 @@ class ArchivingOrchestrator: # merge the new config with the old one config = merge_dicts(vars(parsed), yaml_config) + # set up the authentication dict as needed + config = self.setup_authentication(config) + # clean out args from the base_parser that we don't want in the config for key in vars(basic_config): config.pop(key, None) @@ -286,14 +231,20 @@ class ArchivingOrchestrator: self.basic_parser.exit() def setup_logging(self, config): + + logging_config = config['logging'] + + if logging_config.get('enabled', True) is False: + # disabled logging settings, they're set on a higher level + logger.disable('auto_archiver') + return + # setup loguru logging try: logger.remove(0) # remove the default logger except ValueError: pass - logging_config = config['logging'] - # add other logging info if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 self.logger_id = logger.add(sys.stderr, level=logging_config['level']) @@ -312,27 +263,25 @@ class ArchivingOrchestrator: step_items = [] modules_to_load = modules_by_type[f"{module_type}s"] - assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" + if not modules_to_load: + raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)") def check_steps_ok(): if not len(step_items): - logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.") if len(modules_to_load): - logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}") - exit() + logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}") + raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.") + if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: - logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") - exit() + raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") for module in modules_to_load: if module == 'cli_feeder': - # pseudo module, don't load it + # cli_feeder is a pseudo module, it just takes the command line args for [URLS] urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. 
Use --help for more information.") - exit() - # cli_feeder is a pseudo module, it just takes the command line args + raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.") def feed(self) -> Generator[Metadata]: for url in urls: @@ -352,13 +301,14 @@ class ArchivingOrchestrator: if module in invalid_modules: continue + try: loaded_module: BaseModule = self.module_factory.get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") if module_type == 'extractor' and loaded_module.name == module: loaded_module.cleanup() - exit() + raise e if not loaded_module: invalid_modules.append(module) @@ -372,7 +322,7 @@ class ArchivingOrchestrator: def load_config(self, config_file: str) -> dict: if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE: logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") - exit() + raise FileNotFoundError(f"Configuration file {config_file} not found") return read_yaml(config_file) @@ -437,8 +387,12 @@ class ArchivingOrchestrator: If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately. To test configurations, without loading any modules you can also first call 'setup_configs' """ - self.setup(args) - return self.feed() + try: + self.setup(args) + return self.feed() + except Exception as e: + logger.error(e) + exit(1) def cleanup(self) -> None: logger.info("Cleaning up") @@ -503,8 +457,8 @@ class ArchivingOrchestrator: original_url = result.get_url().strip() try: - self.assert_valid_url(original_url) - except AssertionError as e: + check_url_or_raise(original_url) + except ValueError as e: logger.error(f"Error archiving URL {original_url}: {e}") raise e @@ -564,26 +518,27 @@ class ArchivingOrchestrator: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") return result + - def assert_valid_url(self, url: str) -> bool: + def setup_authentication(self, config: dict) -> dict: """ - Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes. 
+        Set up authentication for all modules that require it
+
+        Split up strings into multiple sites if they are comma-separated
         """
-        assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
-        parsed = urlparse(url)
-        assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
-        assert parsed.hostname, f"Invalid URL hostname"
-        assert parsed.hostname != "localhost", f"Invalid URL"
+        authentication = config.get('authentication', {})

-        try: # special rules for IP addresses
-            ip = ip_address(parsed.hostname)
-        except ValueError: pass
-        else:
-            assert ip.is_global, f"Invalid IP used"
-            assert not ip.is_reserved, f"Invalid IP used"
-            assert not ip.is_link_local, f"Invalid IP used"
-            assert not ip.is_private, f"Invalid IP used"
+        # extract out concatenated sites
+        for key, val in copy(authentication).items():
+            if "," in key:
+                for site in key.split(","):
+                    site = site.strip()
+                    authentication[site] = val
+                del authentication[key]
+
+        config['authentication'] = authentication
+        return config

    # Helper Properties
diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py
index b868ddf..0d3f01f 100644
--- a/src/auto_archiver/core/validators.py
+++ b/src/auto_archiver/core/validators.py
@@ -1,6 +1,7 @@
 # used as validators for config values. Should raise an exception if the value is invalid.
 from pathlib import Path
 import argparse
+import json

 def example_validator(value):
     if "example" not in value:
@@ -16,4 +17,7 @@ def positive_number(value):
 def valid_file(value):
     if not Path(value).is_file():
         raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
-    return value
\ No newline at end of file
+    return value
+
+def json_loader(cli_val):
+    return json.loads(cli_val)
\ No newline at end of file
diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py
index 8359174..e67b31a 100644
--- a/src/auto_archiver/modules/api_db/__manifest__.py
+++ b/src/auto_archiver/modules/api_db/__manifest__.py
@@ -1,5 +1,5 @@
 {
-    "name": "Auto-Archiver API Database",
+    "name": "Auto Archiver API Database",
     "type": ["database"],
     "entry_point": "api_db::AAApiDb",
     "requires_setup": True,
@@ -39,7 +39,7 @@
         },
     },
     "description": """
-    Provides integration with the Auto-Archiver API for querying and storing archival data.
+    Provides integration with the Auto Archiver API for querying and storing archival data.

     ### Features
     - **API Integration**: Supports querying for existing archives and submitting results.
@@ -49,6 +49,6 @@
     - **Optional Storage**: Archives results conditionally based on configuration.

     ### Setup
-    Requires access to an Auto-Archiver API instance and a valid API token.
+    Requires access to an Auto Archiver API instance and a valid API token.
""", } diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 72fe3e0..08d2af7 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -280,6 +280,7 @@ class GenericExtractor(Extractor): # set up auth auth = self.auth_for_site(url, extract_cookies=False) + # order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if 'username' in auth and 'password' in auth: @@ -290,11 +291,11 @@ class GenericExtractor(Extractor): logger.debug(f'Using provided auth cookie for {url}') yt_dlp.utils.std_headers['cookie'] = auth['cookie'] elif 'cookies_from_browser' in auth: - logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}') + logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}') ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser'] elif 'cookies_file' in auth: - logger.debug(f'Using cookies from file {self.cookie_file} for {url}') - ydl_options['cookiesfile'] = auth['cookies_file'] + logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}') + ydl_options['cookiefile'] = auth['cookies_file'] ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index 77026ea..130b9f6 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -15,7 +15,8 @@ "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"}, "service_account": { "default": "secrets/service_account.json", - "help": "service account JSON file path", + "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", + "required": True, }, "columns": { "default": { @@ -34,16 +35,16 @@ "wacz": "wacz", "replaywebpage": "replaywebpage", }, - "help": "names of columns in the google sheet (stringified JSON object)", - "type": "auto_archiver.utils.json_loader", + "help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting", + "type": "json_loader", }, "allow_worksheets": { "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed", }, "block_worksheets": { "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", + "help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed", }, "use_sheet_names_in_stored_paths": { "default": True, @@ -64,8 +65,10 @@ - Ensures only rows with valid URLs and unprocessed statuses are included for archival. - Supports organizing stored files into folder paths based on sheet and worksheet names. - ### Notes - - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - - Create the sheet using the template provided in the docs. 
+    ### Setup
+    - Requires a Google Service Account JSON file for authentication, stored at the path set in `service_account` (default: `secrets/service_account.json`).
+      To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
+    - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
+    - Customize the column names in your Google sheet using the `columns` configuration.
    """,
}
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index 8612d02..ea724e7 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
    def setup(self) -> None:
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
-        assert self.sheet or self.sheet_id, (
-            "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
-        )
+        if not self.sheet and not self.sheet_id:
+            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")

    def open_sheet(self):
        if self.sheet:
diff --git a/src/auto_archiver/modules/telethon_extractor/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py
index e16d9db..458428b 100644
--- a/src/auto_archiver/modules/telethon_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py
@@ -18,7 +18,7 @@
        "channel_invites": {
            "default": {},
            "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
-            "type": "auto_archiver.utils.json_loader",
+            "type": "json_loader",
        }
    },
    "description": """
diff --git a/src/auto_archiver/modules/wacz_enricher/__init__.py b/src/auto_archiver/modules/wacz_enricher/__init__.py
deleted file mode 100644
index 686b8d8..0000000
--- a/src/auto_archiver/modules/wacz_enricher/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .wacz_enricher import WaczExtractorEnricher
diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/__init__.py b/src/auto_archiver/modules/wacz_extractor_enricher/__init__.py
new file mode 100644
index 0000000..b9a53e3
--- /dev/null
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/__init__.py
@@ -0,0 +1 @@
+from .wacz_extractor_enricher import WaczExtractorEnricher
diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py
similarity index 95%
rename from src/auto_archiver/modules/wacz_enricher/__manifest__.py
rename to src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py
index bebfc9e..b8d6201 100644
--- a/src/auto_archiver/modules/wacz_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py
@@ -1,7 +1,7 @@
 {
-    "name": "WACZ Enricher",
+    "name": "WACZ Enricher (and Extractor)",
     "type": ["enricher", "extractor"],
-    "entry_point": "wacz_enricher::WaczExtractorEnricher",
+    "entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
     "requires_setup": True,
     "dependencies": {
         "python": [
diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
similarity index 100%
rename from
src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
rename to src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py
index 4832265..38a5610 100644
--- a/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py
@@ -1,5 +1,5 @@
 {
-    "name": "Wayback Machine Enricher",
+    "name": "Wayback Machine Enricher (and Extractor)",
     "type": ["enricher", "extractor"],
     "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
     "requires_setup": True,
diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py
index 108deae..e46a93d 100644
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -59,10 +59,6 @@ def random_str(length: int = 32) -> str:
     return str(uuid.uuid4()).replace("-", "")[:length]


-def json_loader(cli_val):
-    return json.loads(cli_val)
-
-
 def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
     hash = hash_algo()
     with open(filename, "rb") as f:
diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py
index 40884da..061f4aa 100644
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -1,5 +1,6 @@
 import re
 from urllib.parse import urlparse, urlunparse
+from ipaddress import ip_address


 AUTHWALL_URLS = [
@@ -7,6 +8,43 @@ AUTHWALL_URLS = [
     re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
 ]

+
+def check_url_or_raise(url: str) -> bool:
+    """
+    Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
+ """ + + + if not (url.startswith("http://") or url.startswith("https://")): + raise ValueError(f"Invalid URL scheme for url {url}") + + parsed = urlparse(url) + if not parsed.hostname: + raise ValueError(f"Invalid URL hostname for url {url}") + + if parsed.hostname == "localhost": + raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})") + + if parsed.scheme not in ["http", "https"]: + raise ValueError(f"Invalid URL scheme, only http and https supported (for url {url})") + + try: # special rules for IP addresses + ip = ip_address(parsed.hostname) + except ValueError: + pass + + else: + if not ip.is_global: + raise ValueError(f"IP address {ip} is not globally reachable") + if ip.is_reserved: + raise ValueError(f"Reserved IP address {ip} used") + if ip.is_link_local: + raise ValueError(f"Link-local IP address {ip} used") + if ip.is_private: + raise ValueError(f"Private IP address {ip} used") + + return True + def domain_for_url(url: str) -> str: """ SECURITY: parse the domain using urllib to avoid any potential security issues diff --git a/tests/enrichers/test_wacz_enricher.py b/tests/enrichers/test_wacz_enricher.py index d55733d..ceab83b 100644 --- a/tests/enrichers/test_wacz_enricher.py +++ b/tests/enrichers/test_wacz_enricher.py @@ -18,7 +18,7 @@ def wacz_enricher(setup_module, mock_binary_dependencies): "socks_proxy_port": None, "proxy_server": None, } - wacz = setup_module("wacz_enricher", configs) + wacz = setup_module("wacz_extractor_enricher", configs) return wacz diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index 54f4d9c..33f35b7 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -68,7 +68,7 @@ class TestGenericExtractor(TestExtractorBase): "twitter.com/bellingcat/status/123", "https://www.youtube.com/watch?v=1" ]) - def test_download_nonexistend_media(self, make_item, url): + def test_download_nonexistent_media(self, make_item, url): """ Test to make sure that the extractor doesn't break on non-existend posts/media diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 7c5f501..ef150d1 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -9,7 +9,7 @@ from auto_archiver.core import Metadata, Feeder def test_setup_without_sheet_and_sheet_id(setup_module, mocker): # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set. 
    mocker.patch("gspread.service_account")
-    with pytest.raises(AssertionError):
+    with pytest.raises(ValueError):
        setup_module(
            "gsheet_feeder",
            {"service_account": "dummy.json", "sheet": None, "sheet_id": None},
diff --git a/tests/test_implementation.py b/tests/test_implementation.py
index 85fc448..51e9d79 100644
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@@ -6,7 +6,9 @@ from auto_archiver.__main__ import main

 @pytest.fixture
 def orchestration_file_path(tmp_path):
-    return (tmp_path / "example_orch.yaml").as_posix()
+    folder = tmp_path / "secrets"
+    folder.mkdir(exist_ok=True)
+    return (folder / "example_orch.yaml").as_posix()

 @pytest.fixture
 def orchestration_file(orchestration_file_path):
@@ -28,6 +30,7 @@ def autoarchiver(tmp_path, monkeypatch, request):
        logger.add(sys.stderr)
    request.addfinalizer(cleanup)

+    (tmp_path / "secrets").mkdir(exist_ok=True)
    # change dir to tmp_path
    monkeypatch.chdir(tmp_path)

@@ -66,6 +69,7 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
    # monkey patch to change the current working directory, so that we don't use the user's real config file
    monkeypatch.chdir(tmp_path)
+    (tmp_path / "secrets").mkdir(exist_ok=True)

    with monkeypatch.context() as m:
        m.setattr(sys, "argv", ["auto-archiver"])
        with pytest.raises(SystemExit):
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 301e4d9..752adb8 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
 from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
-
+from auto_archiver.core import Metadata

 TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
 TEST_MODULES = "tests/data/test_modules/"
@@ -160,4 +160,26 @@ def test_load_settings_for_module_from_commandline(orchestrator, test_args):

    assert len(orchestrator.feeders) == 1
    assert orchestrator.feeders[0].name == "gsheet_feeder"
-    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
\ No newline at end of file
+    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
+
+
+def test_multiple_orchestrator(test_args):
+
+    o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+    o1 = ArchivingOrchestrator()
+
+    with pytest.raises(ValueError):
+        # this should fail because the gsheet_feeder requires a sheet_id / sheet
+        o1.setup(o1_args)
+
+    o2_args = test_args + ["--feeders", "example_module"]
+    o2 = ArchivingOrchestrator()
+    o2.setup(o2_args)
+
+    assert o2.feeders[0].name == "example_module"
+
+    output: list[Metadata] = list(o2.feed())
+    assert len(output) == 1
+    assert output[0].get_url() == "https://example.com"
\ No newline at end of file
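
Reviewer note: putting the `AuthenticationJsonParseAction`, `setup_authentication`, and `BaseModule.auth_for_site` changes together, an `authentication` block along these lines should now be accepted. This is a minimal sketch for review purposes only; the site names, credentials, and file paths are illustrative placeholders, not part of this diff:

```yaml
authentication:
  # comma-separated keys are split into one entry per site by setup_authentication()
  x.com,twitter.com:
    username: archiver_account        # placeholder credentials
    password: example_password
  facebook.com:
    cookies_file: secrets/facebook_cookies.txt  # new per-site option
  instagram.com:
    cookies_from_browser: firefox               # new per-site option
  # global fallbacks (the allowed global_options), consulted only when a site
  # entry sets neither cookie option; priority is site cookies_from_browser,
  # then site cookies_file, then the global equivalents
  cookies_from_browser: chrome
  cookies_file: secrets/cookies.txt
```

The same block can also be supplied as a separate JSON or YAML file via `from_file`, which `load_from_file` above resolves and records under the `load_from_file` key.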