From 2650cd8fb2baab75b1ce925c89c867146c2cb5b4 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 22:51:04 +0000 Subject: [PATCH] Use a script to auto-generate documentation for the core modules from the manifest file --- .gitignore | 1 + README.md | 6 +- docs/scripts/__init__.py | 1 + docs/scripts/scripts.py | 79 +++++++++++++++++++ docs/source/conf.py | 21 +++-- docs/source/core_modules.md | 26 ++++++ docs/source/core_modules.rst | 11 --- docs/source/development/testing.md | 20 +++-- docs/source/index.md | 16 ++++ docs/source/index.rst | 18 ----- docs/source/installation/installation.md | 18 ++--- docs/source/modules/database.md | 8 ++ docs/source/modules/enricher.md | 7 ++ docs/source/modules/extractor.md | 11 +++ docs/source/modules/feeder.md | 8 ++ docs/source/modules/formatter.md | 6 ++ docs/source/modules/storage.md | 8 ++ .../modules/wacz_enricher/__manifest__.py | 2 +- .../__manifest__.py | 2 +- 19 files changed, 216 insertions(+), 53 deletions(-) create mode 100644 docs/scripts/__init__.py create mode 100644 docs/scripts/scripts.py create mode 100644 docs/source/core_modules.md delete mode 100644 docs/source/core_modules.rst create mode 100644 docs/source/index.md delete mode 100644 docs/source/index.rst create mode 100644 docs/source/modules/database.md create mode 100644 docs/source/modules/enricher.md create mode 100644 docs/source/modules/extractor.md create mode 100644 docs/source/modules/feeder.md create mode 100644 docs/source/modules/formatter.md create mode 100644 docs/source/modules/storage.md diff --git a/.gitignore b/.gitignore index 7c6bf08..701de43 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,4 @@ archived/ dist* docs/_build/ docs/source/autoapi/ +docs/source/modules/autogen/ diff --git a/README.md b/README.md index b279a50..bffa9e0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Python tool to automatically archive social media posts, videos, and images from ## Installation -For full For instructions on 
how to install auto-archiver, view the [Installation Guide](docs/source/installation.md) +For full instructions on how to install auto-archiver, view the [Installation Guide](docs/source/installation.md) Quick run using docker: @@ -65,7 +65,9 @@ auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,u ``` Here's the complete workflow that the auto-archiver goes through: -```{mermaid} + +```mermaid + graph TD s((start)) --> F(fa:fa-table Feeder) F -->|get and clean URL| D1{fa:fa-database Database} diff --git a/docs/scripts/__init__.py b/docs/scripts/__init__.py new file mode 100644 index 0000000..ba9737c --- /dev/null +++ b/docs/scripts/__init__.py @@ -0,0 +1 @@ +from scripts import generate_module_docs \ No newline at end of file diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py new file mode 100644 index 0000000..34acd48 --- /dev/null +++ b/docs/scripts/scripts.py @@ -0,0 +1,79 @@ +# iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table +from pathlib import Path +from auto_archiver.core.module import available_modules +from auto_archiver.core.base_module import BaseModule + +MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_archiver" / "modules" +SAVE_FOLDER = Path(__file__).parent.parent / "source" / "modules" / "autogen" + +type_color = { + 'feeder': "[feeder](/core_modules.md#feeder-modules)", + 'extractor': "[extractor](/core_modules.md#extractor-modules)", + 'enricher': "[enricher](/core_modules.md#enricher-modules)", + 'database': "[database](/core_modules.md#database-modules)", + 'storage': "[storage](/core_modules.md#storage-modules)", + 'formatter': "[formatter](/core_modules.md#formatter-modules)", +} + + +def generate_module_docs(): + SAVE_FOLDER.mkdir(exist_ok=True) + modules_by_type = {} + + for module in available_modules(with_manifest=True): + # generate the markdown file from the __manifest__.py file. 
+ + manifest = module.manifest + for type in manifest['type']: + modules_by_type.setdefault(type, []).append(module) + + description = "\n".join(l.lstrip() for l in manifest['description'].split("\n")) + types = ", ".join(type_color[t] for t in manifest['type']) + readme_str = f""" +# {manifest['name']} +```{{admonition}} Module type + +{types} +``` +{description} +""" + if manifest['configs']: + readme_str += "\n## Configuration Options\n" + readme_str += "| Option | Description | Default | Type |\n" + readme_str += "| --- | --- | --- | --- |\n" + for key, value in manifest['configs'].items(): + type = value.get('type', 'string') + if type == 'auto_archiver.utils.json_loader': + type = 'json' # display label only; the loaded manifest is left unmodified + elif type == 'str': + type = "string" + + help = "**Required**. " if value.get('required', False) else "Optional. " + help += value.get('help', '') + readme_str += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n" + + + # make type folder if it doesn't exist + + + with open(SAVE_FOLDER / f"{module.name}.md", "w") as f: + print("writing", SAVE_FOLDER / f"{module.name}.md") + f.write(readme_str) + + generate_index(modules_by_type) + +def generate_index(modules_by_type): + readme_str = "" + for type in BaseModule.MODULE_TYPES: + modules = modules_by_type.get(type, []) + module_str = f"## {type.capitalize()} Modules\n" + for module in modules: + module_str += f"\n[{module.manifest['name']}](/modules/autogen/{module.name}.md)\n" + with open(SAVE_FOLDER / f"{type}.md", "w") as f: + print("writing", SAVE_FOLDER / f"{type}.md") + f.write(module_str) + readme_str += module_str + + with open(SAVE_FOLDER / "module_list.md", "w") as f: + print("writing", SAVE_FOLDER / "module_list.md") + f.write(readme_str) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7aac1ec..54988ed 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,20 +1,29 @@ # Configuration file for the Sphinx documentation builder. 
# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Project information ----------------------------------------------------- +import sys +import os from importlib.metadata import metadata +sys.path.insert(0, os.path.abspath('../scripts')) +from scripts import generate_module_docs + +# -- Project Hooks ----------------------------------------------------------- +# convert the module __manifest__.py files into markdown files +generate_module_docs() + + +# -- Project information ----------------------------------------------------- package_metadata = metadata("auto-archiver") project = package_metadata["name"] authors = package_metadata["authors"] release = package_metadata["version"] - +language = 'en' # -- General configuration --------------------------------------------------- extensions = [ "autoapi.extension", # Generate API documentation from docstrings + "sphinxcontrib.mermaid", # Mermaid diagrams "myst_parser", # Markdown support - 'sphinxcontrib.mermaid', # Mermaid diagrams "sphinx.ext.viewcode", # Source code links "sphinx.ext.napoleon", # Google-style and NumPy-style docstrings "sphinx.ext.autosectionlabel", @@ -54,8 +63,10 @@ myst_enable_extensions = [ "smartquotes", # Smart quotes "linkify", # Auto-detect links "substitution", # Text substitutions + "attrs_block", ] myst_heading_anchors = 2 +myst_fence_as_directive = ["mermaid"] source_suffix = { ".rst": "restructuredtext", @@ -63,6 +74,6 @@ source_suffix = { } # -- Options for HTML output ------------------------------------------------- -html_theme = 'furo' +html_theme = 'sphinx_book_theme' # html_static_path = ['_static'] diff --git a/docs/source/core_modules.md b/docs/source/core_modules.md new file mode 100644 index 0000000..8fd548e --- /dev/null +++ b/docs/source/core_modules.md @@ -0,0 +1,26 @@ +# Module Documentation + +These pages describe the core modules that come with `auto-archiver` and provide the basic functionality for archiving websites on the internet. 
There are five core module types: + +1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing +2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder +3. Enrichers - these 'enrich' the data extracted in the previous step with additional information +4. Storage - these 'store' the data in a persistent location (on disk, Google Drive etc.) +5. Databases - these 'store' the status of the entire archiving process in a log file or database. + + +```{include} modules/autogen/module_list.md +``` + + +```{toctree} +:maxdepth: 1 +:caption: Core Modules +:hidden: + +modules/feeder +modules/extractor +modules/enricher +modules/storage +modules/database +``` \ No newline at end of file diff --git a/docs/source/core_modules.rst b/docs/source/core_modules.rst deleted file mode 100644 index d7b0e66..0000000 --- a/docs/source/core_modules.rst +++ /dev/null @@ -1,11 +0,0 @@ -Core Modules -============ - -These pages are intended for developers of the `auto-archiver` package, and include documentation on the core classes and functions used by the auto-archiver - -.. toctree:: - :titlesonly: - - {% for page in pages|selectattr("is_top_level_object") %} - {{ page.include_path }} - {% endfor %} diff --git a/docs/source/development/testing.md b/docs/source/development/testing.md index 6db7e37..5de9574 100644 --- a/docs/source/development/testing.md +++ b/docs/source/development/testing.md @@ -1,13 +1,21 @@ -### Testing +# Testing -Tests are split using `pytest.mark` into 'core' and 'download' tests. Download tests will hit the network and make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed. +`pytest` is used for testing. There are two main types of tests: -Tests can be run as follows: +1. 'core' tests which should be run on every change +2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. 
Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed. + + +## Running Tests + +1. Make sure you've installed the dev dependencies with `poetry install --with dev` +2. Tests can be run as follows: ``` +#### Command prefix of 'poetry run' removed here for simplicity # run core tests -pytest -ra -v -m "not download" # or poetry run pytest -ra -v -m "not download" +pytest -ra -v -m "not download" # run download tests -pytest -ra -v -m "download" # or poetry run pytest -ra -v -m "download" +pytest -ra -v -m "download" # run all tests -pytest -ra -v # or poetry run pytest -ra -v +pytest -ra -v ``` \ No newline at end of file diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..0c64a13 --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,16 @@ + +```{include} ../../README.md +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Contents: + +Overview +user_guidelines +installation/installation.rst +core_modules.md +development/developer_guidelines +autoapi/index.rst +``` \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 517cbe4..0000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,18 +0,0 @@ - -.. include:: ../../README.md - :parser: myst - - -.. toctree:: - - :maxdepth: 2 - :hidden: - :caption: Contents: - - Overview - user_guidelines - installation/installation - development/developer_guidelines - autoapi/index - core_modules - diff --git a/docs/source/installation/installation.md b/docs/source/installation/installation.md index ad4868c..99c6bf6 100644 --- a/docs/source/installation/installation.md +++ b/docs/source/installation/installation.md @@ -3,7 +3,7 @@ There are 3 main ways to use the auto-archiver: 1. Easiest: [via docker](#installing-with-docker) -2. Local Install: [using pip](#local-installing-with-pip) +2. Local Install: [using pip](#installing-locally-with-pip) 3. 
Developer Install: [see the developer guidelines](../development/developer_guidelines) @@ -17,9 +17,9 @@ But **you always need a configuration/orchestration file**, which is where you'l Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag. -1. install [docker](https://docs.docker.com/get-docker/) -2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver` -3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down: +1. Install [docker](https://docs.docker.com/get-docker/) +2. Pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver` +3. Run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down: 1. `docker run` tells docker to start a new container (an instance of the image) 2. `--rm` makes sure this container is removed after execution (less garbage locally) 3. `-v $PWD/secrets:/app/secrets` - your secrets folder @@ -31,12 +31,12 @@ Docker works like a virtual machine running inside your computer, it isolates ev 2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker 3. 
`/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file -## Local installing with Pip +## Installing Locally with Pip -1. make sure you have python 3.10 or higher installed -2. install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver` -3. test it's installed with `auto-archiver --help` -4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise +1. Make sure you have python 3.10 or higher installed +2. Install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver` +3. Test it's installed with `auto-archiver --help` +4. Run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise ### Installing Local Requirements diff --git a/docs/source/modules/database.md b/docs/source/modules/database.md new file mode 100644 index 0000000..9719a62 --- /dev/null +++ b/docs/source/modules/database.md @@ -0,0 +1,8 @@ +# Database Modules + +Database modules are used to store the status and results of the extraction and enrichment processes somewhere. The database modules are responsible for creating and managing entries for each item that has been processed. + +The default (enabled) databases are the CSV Database and the Console Database. + +```{include} autogen/database.md +``` diff --git a/docs/source/modules/enricher.md b/docs/source/modules/enricher.md new file mode 100644 index 0000000..b27958e --- /dev/null +++ b/docs/source/modules/enricher.md @@ -0,0 +1,7 @@ +# Enricher Modules + +Enricher modules are used to add additional information to the items that have been extracted. 
Common enrichment tasks include adding metadata to items, such as the hash of the item, a screenshot of the webpage when the item was extracted, or general metadata like the date and time the item was extracted. + + +```{include} autogen/enricher.md +``` \ No newline at end of file diff --git a/docs/source/modules/extractor.md b/docs/source/modules/extractor.md new file mode 100644 index 0000000..ddd09b9 --- /dev/null +++ b/docs/source/modules/extractor.md @@ -0,0 +1,11 @@ +# Extractor Modules + +Extractor modules are used to extract the content of a given URL. Typically, one extractor will work for one website or platform (e.g. a Telegram extractor or an Instagram extractor), however, there are several wide-ranging extractors which work for a wide range of websites. + +Extractors that are able to extract content from a wide range of websites include: +1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library. +2. Wayback Machine Extractor: sends pages to the Wayback Machine for archiving, and stores the link. +3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format. + +```{include} autogen/extractor.md +``` \ No newline at end of file diff --git a/docs/source/modules/feeder.md b/docs/source/modules/feeder.md new file mode 100644 index 0000000..5ba77a4 --- /dev/null +++ b/docs/source/modules/feeder.md @@ -0,0 +1,8 @@ +# Feeder Modules + +Feeder modules are used to feed URLs into the `auto-archiver` for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line. + +The default feeder is the command line feeder, which allows you to input URLs directly into the `auto-archiver` from the command line. 
+ +```{include} autogen/feeder.md +``` \ No newline at end of file diff --git a/docs/source/modules/formatter.md b/docs/source/modules/formatter.md new file mode 100644 index 0000000..08bc5fb --- /dev/null +++ b/docs/source/modules/formatter.md @@ -0,0 +1,6 @@ +# Formatter Modules + +Formatter modules are used to format the data extracted from a URL into a specific format. Currently the most widely-used formatter is the HTML formatter, which formats the data into an easily viewable HTML page. + +```{include} autogen/formatter.md +``` \ No newline at end of file diff --git a/docs/source/modules/storage.md b/docs/source/modules/storage.md new file mode 100644 index 0000000..998409a --- /dev/null +++ b/docs/source/modules/storage.md @@ -0,0 +1,8 @@ +# Storage Modules + +Storage modules are used to store the data extracted from a URL in a persistent location. This can be on your local hard disk, or on a remote server (e.g. S3 or Google Drive). + +The default is to store the files downloaded (e.g. images, videos) in a local directory. 
+ +```{include} autogen/storage.md +``` \ No newline at end of file diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_enricher/__manifest__.py index 46ce05e..bebfc9e 100644 --- a/src/auto_archiver/modules/wacz_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_enricher/__manifest__.py @@ -1,6 +1,6 @@ { "name": "WACZ Enricher", - "type": ["enricher", "archiver"], + "type": ["enricher", "extractor"], "entry_point": "wacz_enricher::WaczExtractorEnricher", "requires_setup": True, "dependencies": { diff --git a/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py index baecc14..4832265 100644 --- a/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wayback_extractor_enricher/__manifest__.py @@ -1,6 +1,6 @@ { "name": "Wayback Machine Enricher", - "type": ["enricher", "archiver"], + "type": ["enricher", "extractor"], "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher", "requires_setup": True, "dependencies": {