Mirror of https://github.com/bellingcat/auto-archiver
Use a script to auto-generate documentation for the core modules from the manifest file
parent 824728739a
commit 2650cd8fb2
@@ -32,3 +32,4 @@ archived/
dist*
docs/_build/
docs/source/autoapi/
docs/source/modules/autogen/
@@ -17,7 +17,7 @@ Python tool to automatically archive social media posts, videos, and images from

## Installation

For full For instructions on how to install auto-archiver, view the [Installation Guide](docs/source/installation.md)
For full instructions on how to install auto-archiver, view the [Installation Guide](docs/source/installation.md)

Quick run using docker:
@@ -65,7 +65,9 @@ auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,u
```

Here's the complete workflow that the auto-archiver goes through:

```{mermaid}
```mermaid
graph TD
    s((start)) --> F(fa:fa-table Feeder)
    F -->|get and clean URL| D1{fa:fa-database Database}
@@ -0,0 +1 @@
from scripts import generate_module_docs
@@ -0,0 +1,79 @@
# iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table
from pathlib import Path
from auto_archiver.core.module import available_modules
from auto_archiver.core.base_module import BaseModule

MODULES_FOLDER = Path(__file__).parent.parent.parent.parent / "src" / "auto_archiver" / "modules"
SAVE_FOLDER = Path(__file__).parent.parent / "source" / "modules" / "autogen"
type_color = {
    'feeder': "<span style='color: #FFA500'>[feeder](/core_modules.md#feeder-modules)</span>",
    'extractor': "<span style='color: #00FF00'>[extractor](/core_modules.md#extractor-modules)</span>",
    'enricher': "<span style='color: #0000FF'>[enricher](/core_modules.md#enricher-modules)</span>",
    'database': "<span style='color: #FF00FF'>[database](/core_modules.md#database-modules)</span>",
    'storage': "<span style='color: #FFFF00'>[storage](/core_modules.md#storage-modules)</span>",
    'formatter': "<span style='color: #00FFFF'>[formatter](/core_modules.md#formatter-modules)</span>",
}

def generate_module_docs():
    SAVE_FOLDER.mkdir(exist_ok=True)
    modules_by_type = {}

    for module in available_modules(with_manifest=True):
        # generate the markdown file from the __manifest__.py file.

        manifest = module.manifest
        for type in manifest['type']:
            modules_by_type.setdefault(type, []).append(module)

        description = "\n".join(l.lstrip() for l in manifest['description'].split("\n"))
        types = ", ".join(type_color[t] for t in manifest['type'])
        readme_str = f"""
# {manifest['name']}
```{{admonition}} Module type

{types}
```
{description}
"""
        if manifest['configs']:
            readme_str += "\n## Configuration Options\n"
            readme_str += "| Option | Description | Default | Type |\n"
            readme_str += "| --- | --- | --- | --- |\n"
            for key, value in manifest['configs'].items():
                type = value.get('type', 'string')
                if type == 'auto_archiver.utils.json_loader':
                    type = 'json'
                elif type == 'str':
                    type = "string"

                help = "**Required**. " if value.get('required', False) else "Optional. "
                help += value.get('help', '')
                readme_str += f"| `{module.name}.{key}` | {help} | {value.get('default', '')} | {type} |\n"

        # make type folder if it doesn't exist

        with open(SAVE_FOLDER / f"{module.name}.md", "w") as f:
            print("writing", SAVE_FOLDER)
            f.write(readme_str)

    generate_index(modules_by_type)


def generate_index(modules_by_type):
    readme_str = ""
    for type in BaseModule.MODULE_TYPES:
        modules = modules_by_type.get(type, [])
        module_str = f"## {type.capitalize()} Modules\n"
        for module in modules:
            module_str += f"\n[{module.manifest['name']}](/modules/autogen/{module.name}.md)\n"
        with open(SAVE_FOLDER / f"{type}.md", "w") as f:
            print("writing", SAVE_FOLDER / f"{type}.md")
            f.write(module_str)
        readme_str += module_str

    with open(SAVE_FOLDER / "module_list.md", "w") as f:
        print("writing", SAVE_FOLDER / "module_list.md")
        f.write(readme_str)
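For orientation, here is a rough sketch of what this script consumes and produces. The manifest below is modelled on the WACZ manifest changed further down in this diff; the `timeout` config option and the `description` string are invented purely for illustration and are not from the codebase.

```python
# Illustrative __manifest__.py contents; real manifests carry more detail.
# The "timeout" option and "description" text are made up for this example.
{
    "name": "WACZ Enricher",
    "type": ["enricher", "extractor"],
    "entry_point": "wacz_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "description": "Placeholder description of the module.",
    "dependencies": {},
    "configs": {
        "timeout": {"default": 120, "type": "int", "help": "seconds to wait for the crawl"},
    },
}
```

For a manifest like this, the config loop above would emit a table row along the lines of `| <module_name>.timeout | Optional. seconds to wait for the crawl | 120 | int |` into the corresponding `docs/source/modules/autogen/<module_name>.md` file.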
@@ -1,20 +1,29 @@
# Configuration file for the Sphinx documentation builder.
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
import sys
import os
from importlib.metadata import metadata

sys.path.insert(0, os.path.abspath('../scripts'))
from scripts import generate_module_docs

# -- Project Hooks -----------------------------------------------------------
# convert the module __manifest__.py files into markdown files
generate_module_docs()


# -- Project information -----------------------------------------------------
package_metadata = metadata("auto-archiver")
project = package_metadata["name"]
authors = package_metadata["authors"]
release = package_metadata["version"]

language = 'en'

# -- General configuration ---------------------------------------------------
extensions = [
    "autoapi.extension",  # Generate API documentation from docstrings
    "sphinxcontrib.mermaid",  # Mermaid diagrams
    "myst_parser",  # Markdown support
    'sphinxcontrib.mermaid',  # Mermaid diagrams
    "sphinx.ext.viewcode",  # Source code links
    "sphinx.ext.napoleon",  # Google-style and NumPy-style docstrings
    "sphinx.ext.autosectionlabel",

@@ -54,8 +63,10 @@ myst_enable_extensions = [
    "smartquotes",  # Smart quotes
    "linkify",  # Auto-detect links
    "substitution",  # Text substitutions
    "attrs_block",
]
myst_heading_anchors = 2
myst_fence_as_directive = ["mermaid"]

source_suffix = {
    ".rst": "restructuredtext",

@@ -63,6 +74,6 @@ source_suffix = {
}

# -- Options for HTML output -------------------------------------------------
html_theme = 'furo'
html_theme = 'sphinx_book_theme'
# html_static_path = ['_static']
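Since `conf.py` now calls `generate_module_docs()` at import time, the autogen pages are regenerated on every documentation build. A minimal sketch of kicking off a local HTML build from Python, equivalent to `sphinx-build -b html docs/source docs/_build/html`; the paths are assumed from the layout added in this commit:

```python
# Build the docs locally; sphinx.cmd.build.build_main mirrors the sphinx-build CLI.
from sphinx.cmd.build import build_main

exit_code = build_main(["-b", "html", "docs/source", "docs/_build/html"])
raise SystemExit(exit_code)
```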
@@ -0,0 +1,26 @@
# Module Documentation

These pages describe the core modules that come with `auto-archiver` and provide the basic functionality for archiving websites on the internet. There are five core module types:

1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing
2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder
3. Enrichers - these 'enrich' the data extracted in the previous step with additional information
4. Storage - these 'store' the data in a persistent location (on disk, Google Drive etc.)
5. Databases - these 'store' the status of the entire archiving process in a log file or database.

```{include} modules/autogen/module_list.md
```

```{toctree}
:maxdepth: 1
:caption: Core Modules
:hidden:

modules/feeder
modules/extractor
modules/enricher
modules/storage
modules/database
```
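As a quick way to see which concrete modules fall under each of these types in an installed copy of `auto-archiver`, the same helper used by the docs-generation script above can be called directly; a minimal sketch, with output formatting left up to you:

```python
# List every bundled module and the module types its manifest declares.
from auto_archiver.core.module import available_modules

for module in available_modules(with_manifest=True):
    print(f"{module.name}: {', '.join(module.manifest['type'])}")
```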
@@ -1,11 +0,0 @@
Core Modules
============

These pages are intended for developers of the `auto-archiver` package, and include documentation on the core classes and functions used by the auto-archiver

.. toctree::
   :titlesonly:

   {% for page in pages|selectattr("is_top_level_object") %}
   {{ page.include_path }}
   {% endfor %}
@@ -1,13 +1,21 @@
### Testing
# Testing

Tests are split using `pytest.mark` into 'core' and 'download' tests. Download tests will hit the network and make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
`pytest` is used for testing. There are two main types of tests:

Tests can be run as follows:
1. 'core' tests which should be run on every change
2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.

## Running Tests

1. Make sure you've installed the dev dependencies with `poetry install --with dev`
2. Tests can be run as follows:
```
#### Command prefix of 'poetry run' removed here for simplicity
# run core tests
pytest -ra -v -m "not download" # or poetry run pytest -ra -v -m "not download"
pytest -ra -v -m "not download"
# run download tests
pytest -ra -v -m "download" # or poetry run pytest -ra -v -m "download"
pytest -ra -v -m "download"
# run all tests
pytest -ra -v # or poetry run pytest -ra -v
pytest -ra -v
```
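For reference, the 'download' group is selected via an ordinary pytest marker; a hypothetical network-hitting test would opt in like this (the test name and body are illustrative, not taken from the codebase):

```python
import pytest


@pytest.mark.download
def test_bluesky_post_is_archived():
    # Hits a live API, so it is skipped by `pytest -m "not download"`
    # and only runs with `pytest -m "download"`.
    ...
```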
@@ -0,0 +1,16 @@

```{include} ../../README.md
```

```{toctree}
:maxdepth: 2
:hidden:
:caption: Contents:

Overview <self>
user_guidelines
installation/installation.rst
core_modules.md
development/developer_guidelines
autoapi/index.rst
```
@@ -1,18 +0,0 @@

.. include:: ../../README.md
   :parser: myst


.. toctree::
   :maxdepth: 2
   :hidden:
   :caption: Contents:

   Overview <self>
   user_guidelines
   installation/installation
   development/developer_guidelines
   autoapi/index
   core_modules
@@ -3,7 +3,7 @@

There are 3 main ways to use the auto-archiver:
1. Easiest: [via docker](#installing-with-docker)
2. Local Install: [using pip](#local-installing-with-pip)
2. Local Install: [using pip](#installing-locally-with-pip)
3. Developer Install: [see the developer guidelines](../development/developer_guidelines)
@@ -17,9 +17,9 @@ But **you always need a configuration/orchestration file**, which is where you'l
Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.


1. install [docker](https://docs.docker.com/get-docker/)
2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
1. Install [docker](https://docs.docker.com/get-docker/)
2. Pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
3. Run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
   1. `docker run` tells docker to start a new container (an instance of the image)
   2. `--rm` makes sure this container is removed after execution (less garbage locally)
   3. `-v $PWD/secrets:/app/secrets` - your secrets folder
@@ -31,12 +31,12 @@ Docker works like a virtual machine running inside your computer, it isolates ev
   2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
   3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file

## Local installing with Pip
## Installing Locally with Pip

1. make sure you have python 3.10 or higher installed
2. install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
1. Make sure you have python 3.10 or higher installed
2. Install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
3. Test it's installed with `auto-archiver --help`
4. Run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise

### Installing Local Requirements
@@ -0,0 +1,8 @@
# Database Modules

Database modules are used to store the status and results of the extraction and enrichment processes. They are responsible for creating and managing entries for each item that has been processed.

The default (enabled) databases are the CSV Database and the Console Database.

```{include} autogen/database.md
```
@@ -0,0 +1,7 @@
# Enricher Modules

Enricher modules are used to add additional information to the items that have been extracted. Common enrichment tasks include adding metadata to items, such as the hash of the item, a screenshot of the webpage when the item was extracted, or general metadata like the date and time the item was extracted.

```{include} autogen/enricher.md
```
@@ -0,0 +1,11 @@
# Extractor Modules

Extractor modules are used to extract the content of a given URL. Typically, one extractor will work for one website or platform (e.g. a Telegram extractor or an Instagram extractor); however, there are several general-purpose extractors that work across a wide range of websites.

Extractors that are able to extract content from a wide range of websites include:
1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
2. Wayback Machine Extractor: sends pages to the Wayback Machine for archiving, and stores the link.
3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.

```{include} autogen/extractor.md
```
@@ -0,0 +1,8 @@
# Feeder Modules

Feeder modules are used to feed URLs into the `auto-archiver` for processing. Feeders can take these URLs from a variety of sources, such as a file, a database, or the command line.

The default feeder is the command line feeder, which allows you to input URLs directly into the `auto-archiver` from the command line.

```{include} autogen/feeder.md
```
@@ -0,0 +1,6 @@
# Formatter Modules

Formatter modules take the data extracted from a URL and render it in a final output format. Currently, the most widely used formatter is the HTML formatter, which renders the data as an easily viewable HTML page.

```{include} autogen/formatter.md
```
@@ -0,0 +1,8 @@
# Storage Modules

Storage modules are used to store the data extracted from a URL in a persistent location. This can be on your local hard disk, or on a remote server (e.g. S3 or Google Drive).

The default is to store the files downloaded (e.g. images, videos) in a local directory.

```{include} autogen/storage.md
```
@@ -1,6 +1,6 @@
{
    "name": "WACZ Enricher",
    "type": ["enricher", "archiver"],
    "type": ["enricher", "extractor"],
    "entry_point": "wacz_enricher::WaczExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
@@ -1,6 +1,6 @@
{
    "name": "Wayback Machine Enricher",
    "type": ["enricher", "archiver"],
    "type": ["enricher", "extractor"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
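Because a manifest can declare more than one type, modules like the two above will now appear under both the Enricher and Extractor indexes produced by the docs script. A minimal sketch of that grouping step, reusing the `setdefault` pattern from `generate_module_docs` (the manifest dict here is trimmed down for illustration):

```python
# Group a dual-type manifest the same way generate_module_docs does.
manifest = {"name": "WACZ Enricher", "type": ["enricher", "extractor"]}

modules_by_type = {}
for module_type in manifest["type"]:
    modules_by_type.setdefault(module_type, []).append(manifest["name"])

print(modules_by_type)
# -> {'enricher': ['WACZ Enricher'], 'extractor': ['WACZ Enricher']}
```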