Mirror of https://github.com/bellingcat/auto-archiver
Merge branch 'load_modules' into timestamping_rewrite
commit 7bb4d68a22
@ -1025,7 +1025,7 @@ version = "0.7.3"
|
|||
description = "Python logging made (stupidly) simple"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.5"
|
||||
groups = ["main"]
|
||||
groups = ["main", "dev"]
|
||||
files = [
|
||||
{file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"},
|
||||
{file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"},
|
||||
|
@ -1750,6 +1750,24 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""}
|
|||
[package.extras]
|
||||
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-loguru"
|
||||
version = "0.4.0"
|
||||
description = "Pytest Loguru"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "pytest_loguru-0.4.0-py3-none-any.whl", hash = "sha256:3cc7b9c6b22cb158209ccbabf0d678dacd3f3c7497d6f46f1c338c13bee1ac77"},
|
||||
{file = "pytest_loguru-0.4.0.tar.gz", hash = "sha256:0d9e4e72ae9bfd92f774c666e7353766af11b0b78edd59c290e89be116050f03"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
loguru = "*"
|
||||
|
||||
[package.extras]
|
||||
test = ["pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.9.0.post0"
|
||||
|
@ -1818,7 +1836,7 @@ version = "6.0.2"
|
|||
description = "YAML parser and emitter for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "docs"]
|
||||
groups = ["docs"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||
|
@ -2086,6 +2104,82 @@ files = [
|
|||
[package.dependencies]
|
||||
pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "ruamel-yaml"
|
||||
version = "0.18.10"
|
||||
description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1"},
|
||||
{file = "ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
"ruamel.yaml.clib" = {version = ">=0.2.7", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.13\""}
|
||||
|
||||
[package.extras]
|
||||
docs = ["mercurial (>5.7)", "ryd"]
|
||||
jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "ruamel-yaml-clib"
|
||||
version = "0.2.12"
|
||||
description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "platform_python_implementation == \"CPython\""
|
||||
files = [
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
|
||||
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
|
||||
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.11.2"
|
||||
|
@ -2956,7 +3050,7 @@ version = "1.2.0"
|
|||
description = "A small Python utility to set file creation time on Windows"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
groups = ["main"]
|
||||
groups = ["main", "dev"]
|
||||
markers = "sys_platform == \"win32\""
|
||||
files = [
|
||||
{file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"},
|
||||
|
@ -2983,14 +3077,14 @@ h11 = ">=0.9.0,<1"
|
|||
|
||||
[[package]]
|
||||
name = "yt-dlp"
|
||||
version = "2025.1.12"
|
||||
version = "2025.1.26"
|
||||
description = "A feature-rich command-line audio/video downloader"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"},
|
||||
{file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"},
|
||||
{file = "yt_dlp-2025.1.26-py3-none-any.whl", hash = "sha256:3e76bd896b9f96601021ca192ca0fbdd195e3c3dcc28302a3a34c9bc4979da7b"},
|
||||
{file = "yt_dlp-2025.1.26.tar.gz", hash = "sha256:1c9738266921ad43c568ad01ac3362fb7c7af549276fbec92bd72f140da16240"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
|
@ -3006,4 +3100,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "d1af74e7fc7c919eda55dd383208edab906508353b4a9eff8e979967484823f8"
|
||||
content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0"
|
||||
|
|
|
@ -37,7 +37,6 @@ dependencies = [
|
|||
"pdqhash (>=0.0.0)",
|
||||
"pillow (>=0.0.0)",
|
||||
"python-slugify (>=0.0.0)",
|
||||
"pyyaml (>=0.0.0)",
|
||||
"dateparser (>=0.0.0)",
|
||||
"python-twitter-v2 (>=0.0.0)",
|
||||
"instaloader (>=0.0.0)",
|
||||
|
@ -47,7 +46,7 @@ dependencies = [
|
|||
"cryptography (>=41.0.0,<42.0.0)",
|
||||
"boto3 (>=1.28.0,<2.0.0)",
|
||||
"dataclasses-json (>=0.0.0)",
|
||||
"yt-dlp (==2025.1.12)",
|
||||
"yt-dlp (>=2025.1.26,<2026.0.0)",
|
||||
"numpy (==2.1.3)",
|
||||
"vk-url-scraper (>=0.0.0)",
|
||||
"requests[socks] (>=0.0.0)",
|
||||
|
@ -58,11 +57,13 @@ dependencies = [
|
|||
"tsp-client (>=0.0.0)",
|
||||
"certvalidator (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.3.4"
|
||||
autopep8 = "^2.3.1"
|
||||
pytest-loguru = "^0.4.0"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
sphinx = "^8.1.3"
|
||||
|
|
|
@ -12,7 +12,7 @@ from googleapiclient.errors import HttpError
|
|||
# Code below from https://developers.google.com/drive/api/quickstart/python
|
||||
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
|
||||
|
||||
|
||||
@click.command(
|
||||
|
@ -23,7 +23,7 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
|
|||
"-c",
|
||||
type=click.Path(exists=True),
|
||||
help="path to the credentials.json file downloaded from https://console.cloud.google.com/apis/credentials",
|
||||
required=True
|
||||
required=True,
|
||||
)
|
||||
@click.option(
|
||||
"--token",
|
||||
|
@ -31,59 +31,62 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
|
|||
type=click.Path(exists=False),
|
||||
default="gd-token.json",
|
||||
help="file where to place the OAuth token, defaults to gd-token.json which you must then move to where your orchestration file points to, defaults to gd-token.json",
|
||||
required=True
|
||||
required=True,
|
||||
)
|
||||
def main(credentials, token):
|
||||
# The file token.json stores the user's access and refresh tokens, and is
|
||||
# created automatically when the authorization flow completes for the first time.
|
||||
creds = None
|
||||
if os.path.exists(token):
|
||||
with open(token, 'r') as stream:
|
||||
with open(token, "r") as stream:
|
||||
creds_json = json.load(stream)
|
||||
# creds = Credentials.from_authorized_user_file(creds_json, SCOPES)
|
||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
||||
creds_json["refresh_token"] = creds_json.get("refresh_token", "")
|
||||
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
|
||||
|
||||
# If there are no (valid) credentials available, let the user log in.
|
||||
if not creds or not creds.valid:
|
||||
if creds and creds.expired and creds.refresh_token:
|
||||
print('Requesting new token')
|
||||
print("Requesting new token")
|
||||
creds.refresh(Request())
|
||||
else:
|
||||
print('First run through so putting up login dialog')
|
||||
print("First run through so putting up login dialog")
|
||||
# credentials.json downloaded from https://console.cloud.google.com/apis/credentials
|
||||
flow = InstalledAppFlow.from_client_secrets_file(credentials, SCOPES)
|
||||
creds = flow.run_local_server(port=55192)
|
||||
# Save the credentials for the next run
|
||||
with open(token, 'w') as token:
|
||||
print('Saving new token')
|
||||
with open(token, "w") as token:
|
||||
print("Saving new token")
|
||||
token.write(creds.to_json())
|
||||
else:
|
||||
print('Token valid')
|
||||
print("Token valid")
|
||||
|
||||
try:
|
||||
service = build('drive', 'v3', credentials=creds)
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
|
||||
# About the user
|
||||
results = service.about().get(fields="*").execute()
|
||||
emailAddress = results['user']['emailAddress']
|
||||
emailAddress = results["user"]["emailAddress"]
|
||||
print(emailAddress)
|
||||
|
||||
# Call the Drive v3 API and return some files
|
||||
results = service.files().list(
|
||||
pageSize=10, fields="nextPageToken, files(id, name)").execute()
|
||||
items = results.get('files', [])
|
||||
results = (
|
||||
service.files()
|
||||
.list(pageSize=10, fields="nextPageToken, files(id, name)")
|
||||
.execute()
|
||||
)
|
||||
items = results.get("files", [])
|
||||
|
||||
if not items:
|
||||
print('No files found.')
|
||||
print("No files found.")
|
||||
return
|
||||
print('Files:')
|
||||
print("Files:")
|
||||
for item in items:
|
||||
print(u'{0} ({1})'.format(item['name'], item['id']))
|
||||
print("{0} ({1})".format(item["name"], item["id"]))
|
||||
|
||||
except HttpError as error:
|
||||
print(f'An error occurred: {error}')
|
||||
print(f"An error occurred: {error}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
"""
|
||||
This script is used to create a new session file for the Telegram client.
|
||||
To do this you must first create a Telegram application at https://my.telegram.org/apps
and store your API ID and hash in the environment variables TELEGRAM_API_ID and TELEGRAM_API_HASH.
Create a .env file, or add the following to your environment:
|
||||
```
|
||||
export TELEGRAM_API_ID=[YOUR_ID_HERE]
|
||||
export TELEGRAM_API_HASH=[YOUR_HASH_HERE]
|
||||
```
|
||||
Then run this script to create a new session file.
|
||||
|
||||
You will need to provide your phone number and a 2FA code the first time you run this script.
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
|
||||
|
||||
# Read the Telegram API credentials from the environment
|
||||
API_ID = os.getenv("TELEGRAM_API_ID")
|
||||
API_HASH = os.getenv("TELEGRAM_API_HASH")
|
||||
SESSION_FILE = "secrets/anon-insta"
|
||||
|
||||
os.makedirs("secrets", exist_ok=True)
|
||||
with TelegramClient(SESSION_FILE, API_ID, API_HASH) as client:
|
||||
logger.success(f"New session file created: {SESSION_FILE}.session")
|
||||
|
|
@ -1,8 +1,9 @@
|
|||
""" Entry point for the auto_archiver package. """
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
import sys
|
||||
|
||||
def main():
|
||||
ArchivingOrchestrator().run()
|
||||
ArchivingOrchestrator().run(sys.argv[1:])
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .module import BaseModule
|
||||
from .context import ArchivingContext
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
|
||||
from urllib.parse import urlparse
|
||||
from typing import Mapping, Any
|
||||
from abc import ABC
|
||||
from copy import deepcopy, copy
|
||||
from tempfile import TemporaryDirectory
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
|
||||
from loguru import logger
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
"""
|
||||
Base module class. All modules should inherit from this class.
|
||||
|
||||
The exact methods a class implements will depend on the type of module it is;
however, modules can have a .setup() method to run any setup code
|
||||
(e.g. logging in to a site, spinning up a browser etc.)
|
||||
|
||||
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
||||
a subclass can be of multiple types. For example, a module that extracts data from
|
||||
a website and stores it in a database would be both an 'extractor' and a 'database' module.
|
||||
|
||||
Each module is a python package, and should have a __manifest__.py file in the
|
||||
same directory as the module file. The __manifest__.py specifies the module information
|
||||
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
|
||||
default manifest structure.
|
||||
|
||||
"""
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
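For reference, a minimal sketch of what a module's __manifest__.py might contain, following the fields documented in _DEFAULT_MANIFEST above. The module name, entry point, dependencies and config key are illustrative only, and this assumes the manifest is a plain Python dict literal (the loader in module.py opens the file directly and parses it with ast).

```
# __manifest__.py -- hypothetical example; fields mirror BaseModule._DEFAULT_MANIFEST
{
    "name": "Example Extractor",                      # display name shown to users
    "author": "Bellingcat",
    "type": ["extractor"],                            # one or more of BaseModule.MODULE_TYPES
    "requires_setup": False,
    "description": "Archives media from example.com URLs (illustrative only)",
    "dependencies": {"python": ["requests"], "bin": ["ffmpeg"]},  # checked at load time
    "entry_point": "example_extractor::ExampleExtractor",
    "version": "1.0",
    "configs": {
        "timeout": {"default": 30},                   # 'default' is merged into the user config on load
    },
}
```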
|
||||
|
||||
config: Mapping[str, Any]
|
||||
authentication: Mapping[str, Mapping[str, str]]
|
||||
name: str
|
||||
|
||||
# this is set by the orchestrator prior to archiving
|
||||
tmp_dir: TemporaryDirectory = None
|
||||
|
||||
@property
|
||||
def storages(self) -> list:
|
||||
return self.config.get('storages', [])
|
||||
|
||||
def config_setup(self, config: dict):
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
||||
# change values to affect other modules
|
||||
config = deepcopy(config)
|
||||
authentication = deepcopy(config.pop('authentication', {}))
|
||||
|
||||
self.authentication = authentication
|
||||
self.config = config
|
||||
for key, val in config.get(self.name, {}).items():
|
||||
setattr(self, key, val)
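A minimal sketch of what config_setup does with a comma-joined authentication key and with per-module options. This is not how modules are normally constructed (the orchestrator builds them via LazyBaseModule.load); the module name, sites and values are placeholders.

```
from auto_archiver.core import BaseModule

class MyModule(BaseModule):   # trivial subclass, purely for illustration
    name = "my_module"

cfg = {
    "authentication": {"x.com,twitter.com": {"api_key": "placeholder"}},
    "my_module": {"save_to": "output/"},
}

m = MyModule()
m.config_setup(cfg)
assert set(m.authentication) == {"x.com", "twitter.com"}  # comma-joined sites are split out
assert m.save_to == "output/"                             # per-module options become attributes
```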
|
||||
|
||||
def setup(self):
|
||||
# For any additional setup required by modules, e.g. authentication
|
||||
pass
|
||||
|
||||
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
|
||||
"""
|
||||
Returns the authentication information for a given site. This is used to authenticate
|
||||
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
|
||||
|
||||
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling this can speed up processing if you don't actually need the cookie jar)
|
||||
|
||||
Currently, the dict can have keys of the following types:
|
||||
- username: str - the username to use for login
|
||||
- password: str - the password to use for login
|
||||
- api_key: str - the API key to use for login
|
||||
- api_secret: str - the API secret to use for login
|
||||
- cookie: str - a cookie string to use for login (specific to this site)
|
||||
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
|
||||
"""
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
site = UrlUtil.domain_for_url(site)
|
||||
# add the 'www' version of the site to the list of sites to check
|
||||
authdict = {}
|
||||
|
||||
|
||||
for to_try in [site, f"www.{site}"]:
|
||||
if to_try in self.authentication:
|
||||
authdict.update(self.authentication[to_try])
|
||||
break
|
||||
|
||||
# do a fuzzy string match just to print a warning - don't use it since it's insecure
|
||||
if not authdict:
|
||||
for key in self.authentication.keys():
|
||||
if key in site or site in key:
|
||||
logger.debug(f"Could not find exact authentication information for site '{site}'. \
|
||||
did find information for '{key}' which is close, is this what you meant? \
|
||||
If so, edit your authentication settings to make sure it exactly matches.")
|
||||
|
||||
def get_ytdlp_cookiejar(args):
|
||||
import yt_dlp
|
||||
from yt_dlp import parse_options
|
||||
logger.debug(f"Extracting cookies from settings: {args[1]}")
|
||||
# parse_options returns a named tuple as follows, we only need the ydl_options part
|
||||
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
|
||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||
|
||||
# get the cookie jar, preferring browser cookies over the cookies file
|
||||
if 'cookies_from_browser' in self.authentication:
|
||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
|
||||
elif 'cookies_file' in self.authentication:
|
||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
|
||||
|
||||
return authdict
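A hedged sketch of how a module might consume the returned dict: the key names follow the docstring above, while the subclass, its name and the idea of passing the cookie jar straight to requests.get are illustrative assumptions rather than code from this commit.

```
import requests
from auto_archiver.core import BaseModule

class ExampleModule(BaseModule):          # illustrative subclass
    name = "example_module"

    def fetch(self, url: str) -> requests.Response:
        auth = self.auth_for_site(url, extract_cookies=True)
        # cookies_jar (if present) is MozillaCookieJar-compatible, per the docstring
        return requests.get(url, cookies=auth.get("cookies_jar"))
```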
|
||||
|
||||
def __repr__(self):
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
|
@ -11,20 +11,39 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
|
|||
from loguru import logger
|
||||
|
||||
from copy import deepcopy
|
||||
from .module import MODULE_TYPES
|
||||
from .module import BaseModule
|
||||
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
yaml = YAML()
|
||||
_yaml: YAML = YAML()
|
||||
|
||||
EMPTY_CONFIG = yaml.load("""
|
||||
EMPTY_CONFIG = _yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
# Global configuration
|
||||
|
||||
# Authentication
|
||||
# a dictionary of authentication information that can be used by extractors to log in to websites.
|
||||
# you can use a comma-separated list for multiple domains on the same line (common use case: x.com,twitter.com)
|
||||
# Common login 'types' are username/password, cookie, api key/token.
|
||||
# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
|
||||
# Some Examples:
|
||||
# facebook.com:
|
||||
# username: "my_username"
|
||||
# password: "my_password"
|
||||
# or for a site that uses an API key:
|
||||
# twitter.com,x.com:
|
||||
# api_key
|
||||
# api_secret
|
||||
# youtube.com:
|
||||
# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
|
||||
|
||||
authentication: {}
|
||||
|
||||
# These are the global configurations that are used by the modules
|
||||
|
||||
logging:
|
||||
|
@ -48,6 +67,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
|||
"""
|
||||
for action in self._actions:
|
||||
if not namespace or action.dest not in namespace:
|
||||
# for actions that are required and already have a default value, remove the 'required' check
|
||||
if action.required and action.default is not None:
|
||||
action.required = False
|
||||
|
||||
if action.default is not None:
|
||||
try:
|
||||
self._check_value(action, action.default)
|
||||
|
@ -120,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
|||
config = None
|
||||
try:
|
||||
with open(yaml_filename, "r", encoding="utf-8") as inf:
|
||||
config = yaml.load(inf)
|
||||
config = _yaml.load(inf)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
|
@ -132,12 +155,9 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
|||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
|
||||
|
||||
def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None:
|
||||
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
config_to_save = deepcopy(config)
|
||||
|
||||
for key1, key2 in do_not_store_keys:
|
||||
if key1 in config_to_save and key2 in config_to_save[key1]:
|
||||
del config_to_save[key1][key2]
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
yaml.dump(config_to_save, outf)
|
||||
_yaml.dump(config_to_save, outf)
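A short round-trip usage sketch for the two helpers above; the file name is the orchestrator's default (orchestration.yaml) and the appended module name is illustrative, so treat this as a rough example rather than documented API.

```
from auto_archiver.core.config import read_yaml, store_yaml

cfg = read_yaml("orchestration.yaml")               # ruamel CommentedMap, comments preserved
cfg["steps"]["enrichers"].append("hash_enricher")   # module name is illustrative
store_yaml(cfg, "orchestration.yaml")               # any top-level 'urls' key is dropped first
```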
|
|
@ -1,64 +0,0 @@
|
|||
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
|
||||
|
||||
This singleton class allows for:
|
||||
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
|
||||
- Marking certain values to persist across resets using `keep_on_reset`.
|
||||
- Managing temporary directories and other shared data used during the archiving process.
|
||||
|
||||
### Key Features:
|
||||
- Creates a single global instance.
|
||||
- Reset functionality allows for clearing configurations, with options for partial or full resets.
|
||||
- Custom getters and setters for commonly used context values like temporary directories.
|
||||
|
||||
"""
|
||||
|
||||
class ArchivingContext:
|
||||
"""
|
||||
Singleton context class for managing global configurations and temporary data.
|
||||
|
||||
ArchivingContext._get_instance() to retrieve it if needed
|
||||
otherwise just
|
||||
ArchivingContext.set(key, value)
|
||||
and
|
||||
ArchivingContext.get(key, default)
|
||||
|
||||
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
|
||||
reset(full_reset=True) will recreate everything including the keep_on_reset status
|
||||
"""
|
||||
_instance = None
|
||||
|
||||
def __init__(self):
|
||||
self.configs = {}
|
||||
self.keep_on_reset = set()
|
||||
|
||||
@staticmethod
|
||||
def get_instance():
|
||||
if ArchivingContext._instance is None:
|
||||
ArchivingContext._instance = ArchivingContext()
|
||||
return ArchivingContext._instance
|
||||
|
||||
@staticmethod
|
||||
def set(key, value, keep_on_reset: bool = False):
|
||||
ac = ArchivingContext.get_instance()
|
||||
ac.configs[key] = value
|
||||
if keep_on_reset: ac.keep_on_reset.add(key)
|
||||
|
||||
@staticmethod
|
||||
def get(key: str, default=None):
|
||||
return ArchivingContext.get_instance().configs.get(key, default)
|
||||
|
||||
@staticmethod
|
||||
def reset(full_reset: bool = False):
|
||||
ac = ArchivingContext.get_instance()
|
||||
if full_reset: ac.keep_on_reset = set()
|
||||
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
|
||||
|
||||
# ---- custom getters/setters for widely used context values
|
||||
|
||||
@staticmethod
|
||||
def set_tmp_dir(tmp_dir: str):
|
||||
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
|
||||
|
||||
@staticmethod
|
||||
def get_tmp_dir() -> str:
|
||||
return ArchivingContext.get_instance().configs.get("tmp_dir")
|
|
@ -1,12 +1,9 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from abc import abstractmethod
|
||||
from typing import Union
|
||||
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
@dataclass
|
||||
class Database(BaseModule):
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
|
|
|
@ -9,11 +9,9 @@ the archiving step and before storage or formatting.
|
|||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
@dataclass
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
||||
|
||||
|
|
|
@ -11,20 +11,23 @@ from abc import abstractmethod
|
|||
from dataclasses import dataclass
|
||||
import mimetypes
|
||||
import os
|
||||
import mimetypes, requests
|
||||
import mimetypes
|
||||
import requests
|
||||
from loguru import logger
|
||||
from retrying import retry
|
||||
import re
|
||||
|
||||
from ..core import Metadata, ArchivingContext, BaseModule
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
@dataclass
|
||||
class Extractor(BaseModule):
|
||||
"""
|
||||
Base class for implementing extractors in the media archiving framework.
|
||||
Subclasses must implement the `download` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
valid_url: re.Pattern = None
|
||||
|
||||
def cleanup(self) -> None:
|
||||
# called when extractors are done, or upon errors, cleanup any resources
|
||||
pass
|
||||
|
@ -32,13 +35,20 @@ class Extractor(BaseModule):
|
|||
def sanitize_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
return url
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
return self.valid_url.match(url)
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
Returns True if this extractor can handle the given URL
|
||||
|
||||
Should be overridden by subclasses
|
||||
|
||||
"""
|
||||
if self.valid_url:
|
||||
return self.match_link(url) is not None
|
||||
|
||||
return True
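An illustrative subclass showing how valid_url and suitable() fit together; the domain pattern and class are made up, and this assumes Extractor and Metadata are importable from auto_archiver.core as the orchestrator's imports suggest.

```
from __future__ import annotations
import re
from auto_archiver.core import Extractor, Metadata

class ExampleSiteExtractor(Extractor):
    # restrict this extractor to one (made-up) domain; suitable() then only
    # returns True for URLs matching this pattern
    valid_url: re.Pattern = re.compile(r"https?://(www\.)?example\.com/\S+")

    def download(self, item: Metadata) -> Metadata | False:
        if not self.suitable(item.get_url()):
            return False
        ...  # platform-specific downloading would go here
        return False
```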
|
||||
|
||||
def _guess_file_type(self, path: str) -> str:
|
||||
|
@ -60,7 +70,7 @@ class Extractor(BaseModule):
|
|||
to_filename = url.split('/')[-1].split('?')[0]
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
|
||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
|
@ -85,5 +95,11 @@ class Extractor(BaseModule):
|
|||
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||
|
||||
@abstractmethod
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
def download(self, item: Metadata) -> Metadata | False:
|
||||
"""
|
||||
Downloads the media from the given URL and returns a Metadata object with the downloaded media.
|
||||
|
||||
If the URL is not supported or the download fails, this method should return False.
|
||||
|
||||
"""
|
||||
pass
|
|
@ -1,11 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import BaseModule
|
||||
|
||||
|
||||
@dataclass
|
||||
class Feeder(BaseModule):
|
||||
|
||||
@abstractmethod
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, Media, BaseModule
|
||||
|
||||
|
||||
@dataclass
|
||||
class Formatter(BaseModule):
|
||||
|
||||
@abstractmethod
|
||||
|
|
|
@ -11,8 +11,6 @@ from dataclasses import dataclass, field
|
|||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
@ -36,12 +34,11 @@ class Media:
|
|||
_mimetype: str = None # eg: image/jpeg
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||
|
||||
def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
|
||||
def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
|
||||
# 'Any' typing for metadata to avoid circular imports. Stores the media
# into the provided/available storages [Storage] and repeats the process for
# its properties, in case they have inner media themselves. For now it
# only goes down 1 level, but it's easy to make it recursive if needed.
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
if not len(storages):
|
||||
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
|
||||
return
|
||||
|
@ -66,8 +63,9 @@ class Media:
|
|||
for inner_media in prop_media.all_inner_media(include_self=True):
|
||||
yield inner_media
|
||||
|
||||
def is_stored(self) -> bool:
|
||||
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
|
||||
def is_stored(self, in_storage) -> bool:
|
||||
# checks if the media is already stored in the given storage
|
||||
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
|
|
|
@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
|
|||
from loguru import logger
|
||||
|
||||
from .media import Media
|
||||
from .context import ArchivingContext
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
|
@ -32,6 +30,7 @@ class Metadata:
|
|||
|
||||
def __post_init__(self):
|
||||
self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
|
||||
self._context = {}
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
|
@ -45,6 +44,7 @@ class Metadata:
|
|||
if overwrite_left:
|
||||
if right.status and len(right.status):
|
||||
self.status = right.status
|
||||
self._context.update(right._context)
|
||||
for k, v in right.metadata.items():
|
||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||
|
@ -57,12 +57,11 @@ class Metadata:
|
|||
return right.merge(self)
|
||||
return self
|
||||
|
||||
def store(self: Metadata, override_storages: List = None):
|
||||
def store(self, storages=[]):
|
||||
# calls .store for all contained media. storages [Storage]
|
||||
self.remove_duplicate_media_by_hash()
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
for media in self.media:
|
||||
media.store(override_storages=storages, url=self.get_url(), metadata=self)
|
||||
media.store(url=self.get_url(), metadata=self, storages=storages)
|
||||
|
||||
def set(self, key: str, val: Any) -> Metadata:
|
||||
self.metadata[key] = val
|
||||
|
@ -206,3 +205,10 @@ class Metadata:
|
|||
if len(r.media) > len(most_complete.media): most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
|
||||
return most_complete
|
||||
|
||||
def set_context(self, key: str, val: Any) -> Metadata:
|
||||
self._context[key] = val
|
||||
return self
|
||||
|
||||
def get_context(self, key: str, default: Any = None) -> Any:
|
||||
return self._context.get(key, default)
|
|
@ -7,59 +7,70 @@ from __future__ import annotations
|
|||
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from abc import ABC
|
||||
import shutil
|
||||
import ast
|
||||
import copy
|
||||
import sys
|
||||
from importlib.util import find_spec
|
||||
import os
|
||||
from os.path import join, dirname
|
||||
from os.path import join
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
from .base_module import BaseModule
|
||||
|
||||
_LAZY_LOADED_MODULES = {}
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '',
|
||||
'author': 'Bellingcat',
|
||||
'type': [],
|
||||
'requires_setup': True,
|
||||
'description': '',
|
||||
'dependencies': {},
|
||||
'entry_point': '',
|
||||
'version': '1.0',
|
||||
'configs': {}
|
||||
}
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
config: dict
|
||||
name: str
|
||||
def setup_paths(paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
|
||||
def setup(self, config: dict):
|
||||
self.config = config
|
||||
for key, val in config.get(self.name, {}).items():
|
||||
setattr(self, key, val)
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
def get_module(module_name: str, additional_paths: List[str] = []) -> LazyBaseModule:
|
||||
# sort based on the length of the path, so that the longest path is last in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
|
||||
def get_module(module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .load(config) on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in _LAZY_LOADED_MODULES:
|
||||
return _LAZY_LOADED_MODULES[module_name]
|
||||
|
||||
module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0]
|
||||
_LAZY_LOADED_MODULES[module_name] = module
|
||||
return module
|
||||
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
return available[0]
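A rough sketch of how the loader helpers above fit together; the extra folder and the 'local_storage' module name are placeholders, and the config dict only carries the keys the loader actually reads here.

```
from auto_archiver.core.module import setup_paths, available_modules, get_module, get_module_lazy

setup_paths(["./my_custom_modules"])              # optional extra module folders (path is illustrative)
print([m.name for m in available_modules()])      # LazyBaseModule objects; only manifests are read

lazy = get_module_lazy("local_storage")           # nothing imported yet ('local_storage' is a placeholder name)
config = {"local_storage": {}}
module = get_module("local_storage", config)      # imports the package, applies config, calls setup()
```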
|
||||
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
|
@ -67,10 +78,9 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
|||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
default_path = [join(dirname(dirname((__file__))), "modules")]
|
||||
all_modules = []
|
||||
|
||||
for module_folder in default_path + additional_paths:
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
|
@ -85,8 +95,13 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
|||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
continue
|
||||
|
||||
all_modules.append(LazyBaseModule(possible_module, possible_module_path))
|
||||
if _LAZY_LOADED_MODULES.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path)
|
||||
|
||||
_LAZY_LOADED_MODULES[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
|
@ -97,8 +112,14 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
|
|||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
|
||||
"""
|
||||
A lazy module class, which only loads the manifest and does not load the module itself.
|
||||
|
||||
This is useful for getting information about a module without actually loading it.
|
||||
|
||||
"""
|
||||
name: str
|
||||
display_name: str
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
|
@ -129,6 +150,10 @@ class LazyBaseModule:
|
|||
@property
|
||||
def requires_setup(self) -> bool:
|
||||
return self.manifest['requires_setup']
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
return self.manifest['name']
|
||||
|
||||
@property
|
||||
def manifest(self) -> dict:
|
||||
|
@ -136,7 +161,7 @@ class LazyBaseModule:
|
|||
return self._manifest
|
||||
# print(f"Loading manifest for module {module_path}")
|
||||
# load the manifest file
|
||||
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
|
||||
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
|
||||
|
||||
with open(join(self.path, MANIFEST_FILE)) as f:
|
||||
try:
|
||||
|
@ -145,7 +170,6 @@ class LazyBaseModule:
|
|||
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
self._manifest = manifest
|
||||
self.display_name = manifest['name']
|
||||
self.type = manifest['type']
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
|
@ -153,7 +177,7 @@ class LazyBaseModule:
|
|||
|
||||
return manifest
|
||||
|
||||
def load(self) -> BaseModule:
|
||||
def load(self, config) -> BaseModule:
|
||||
|
||||
if self._instance:
|
||||
return self._instance
|
||||
|
@ -161,11 +185,31 @@ class LazyBaseModule:
|
|||
# check external dependencies are installed
|
||||
def check_deps(deps, check):
|
||||
for dep in deps:
|
||||
if not len(dep):
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
exit(1)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep))
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
try:
|
||||
m = get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
return True
|
||||
except:
|
||||
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
|
||||
return False
|
||||
except IndexError:
|
||||
# not a module, continue
|
||||
pass
|
||||
|
||||
return find_spec(dep)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), check_python_dep)
|
||||
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
||||
|
||||
|
||||
|
@ -184,9 +228,8 @@ class LazyBaseModule:
|
|||
sub_qualname = f'{qualname}.{file_name}'
|
||||
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
|
||||
# finally, get the class instance
|
||||
instance = getattr(sys.modules[sub_qualname], class_name)()
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
if not getattr(instance, 'name', None):
|
||||
instance.name = self.name
|
||||
|
||||
|
@ -194,6 +237,12 @@ class LazyBaseModule:
|
|||
instance.display_name = self.display_name
|
||||
|
||||
self._instance = instance
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
|
|
@ -5,30 +5,61 @@
|
|||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List
|
||||
from typing import Generator, Union, List, Type
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from tempfile import TemporaryDirectory
|
||||
import traceback
|
||||
|
||||
from rich_argparse import RichHelpFormatter
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from .metadata import Metadata
|
||||
from ..version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module
|
||||
from . import validators
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .module import available_modules, LazyBaseModule, get_module, setup_paths
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .module import BaseModule
|
||||
|
||||
import tempfile, traceback
|
||||
from loguru import logger
|
||||
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
class JsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
try:
|
||||
setattr(namespace, self.dest, json.loads(values))
|
||||
except json.JSONDecodeError as e:
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
||||
|
||||
|
||||
class AuthenticationJsonParseAction(JsonParseAction):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
super().__call__(parser, namespace, values, option_string)
|
||||
auth_dict = getattr(namespace, self.dest)
|
||||
if isinstance(auth_dict, str):
|
||||
# if it's a string, treat it as a path to a JSON/YAML file
|
||||
try:
|
||||
with open(auth_dict, 'r') as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
except:
|
||||
pass
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
for site, auth in auth_dict.items():
|
||||
if not isinstance(site, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
if not hasattr(namespace, self.dest):
|
||||
|
@ -39,10 +70,16 @@ class UniqueAppendAction(argparse.Action):
|
|||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
_do_not_store_keys = []
|
||||
|
||||
feeders: List[Type[Feeder]]
|
||||
extractors: List[Type[Extractor]]
|
||||
enrichers: List[Type[Enricher]]
|
||||
databases: List[Type[Database]]
|
||||
storages: List[Type[Storage]]
|
||||
formatters: List[Type[Formatter]]
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="auto-archiver",
|
||||
add_help=False,
|
||||
description="""
|
||||
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
|
||||
|
@ -51,14 +88,16 @@ class ArchivingOrchestrator:
|
|||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
parser = DefaultValidatingParser(
|
||||
|
@ -76,18 +115,22 @@ class ArchivingOrchestrator:
|
|||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for config in [yaml_config['steps'], basic_config.__dict__]:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
enabled_modules.extend(config.get(f"{module_type}s", []))
|
||||
|
||||
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
|
||||
for module_type in MODULE_TYPES:
|
||||
if modules := getattr(basic_config, f"{module_type}s", []):
|
||||
enabled_modules.extend(modules)
|
||||
|
||||
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=set(enabled_modules), suppress_warnings=True), parser)
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
|
||||
self.add_module_args(simple_modules, parser)
|
||||
|
||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
|
@ -115,7 +158,7 @@ class ArchivingOrchestrator:
|
|||
|
||||
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
||||
logger.info(f"Storing configuration file to {basic_config.config_file}")
|
||||
store_yaml(self.config, basic_config.config_file, self._do_not_store_keys)
|
||||
store_yaml(self.config, basic_config.config_file)
|
||||
|
||||
return self.config
|
||||
|
||||
|
@ -123,28 +166,37 @@ class ArchivingOrchestrator:
|
|||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', help='the feeders to use', action=UniqueAppendAction)
|
||||
|
||||
# allow passing URLs directly on the command line
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
|
||||
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
|
||||
default={},
|
||||
action=AuthenticationJsonParseAction)
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO')
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
# additional modules
|
||||
parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None):
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
if not modules:
|
||||
modules = available_modules(with_manifest=True)
|
||||
|
||||
module: LazyBaseModule
|
||||
for module in modules:
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
|
@ -153,54 +205,54 @@ class ArchivingOrchestrator:
|
|||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
|
||||
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
|
||||
do_not_store = kwargs.pop('do_not_store', False)
|
||||
if do_not_store:
|
||||
self._do_not_store_keys.append((module.name, name))
|
||||
|
||||
if not kwargs.get('metavar', None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
except AttributeError:
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
except KeyError:
|
||||
kwargs['type'] = getattr(validators, kwargs['type'])
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
def show_help(self):
|
||||
def show_help(self, basic_config: dict):
|
||||
# for the help message, we want to load *all* possible modules and show the help
|
||||
# add configs as arg parser arguments
|
||||
|
||||
self.add_additional_args(self.basic_parser)
|
||||
self.add_module_args(parser=self.basic_parser)
|
||||
|
||||
self.basic_parser.print_help()
|
||||
exit()
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self):
    # setup loguru logging
    logger.remove() # remove the default logger
    logger.remove(0) # remove the default logger
    logging_config = self.config['logging']
    logger.add(sys.stderr, level=logging_config['level'])
    if log_file := logging_config['file']:
        logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
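A standalone sketch of the same loguru pattern (not part of this commit; the file name is an example): replace the default sink, then optionally add a rotating file sink.

```python
import sys
from loguru import logger

logger.remove()                                 # drop loguru's default stderr handler
logger.add(sys.stderr, level="INFO")            # console sink at the configured level
logger.add("archiver.log", rotation="1 day")    # optional file sink with rotation
logger.info("logging configured")
```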
|
||||
|
||||
|
||||
def install_modules(self):
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Swaps out the previous 'strings' in the config with the actual modules
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = self.config['steps'][f"{module_type}s"]
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
|
@ -214,14 +266,37 @@ class ArchivingOrchestrator:
|
|||
exit()
|
||||
|
||||
for module in modules_to_load:
|
||||
if module == 'cli_feeder':
|
||||
# pseudo module, don't load it
|
||||
urls = self.config['urls']
|
||||
if not urls:
|
||||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
exit()
|
||||
# cli_feeder is a pseudo module, it just takes the command line args
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
yield Metadata().set_url(url)
|
||||
|
||||
pseudo_module = type('CLIFeeder', (Feeder,), {
|
||||
'name': 'cli_feeder',
|
||||
'display_name': 'CLI Feeder',
|
||||
'__iter__': feed
|
||||
|
||||
})()
|
||||
|
||||
|
||||
pseudo_module.__iter__ = feed
|
||||
step_items.append(pseudo_module)
|
||||
continue
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
loaded_module: BaseModule = get_module(module).load()
|
||||
try:
|
||||
loaded_module.setup(self.config)
|
||||
loaded_module: BaseModule = get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor':
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
loaded_module.cleanup()
|
||||
exit()
|
||||
|
||||
|
@ -230,59 +305,58 @@ class ArchivingOrchestrator:
|
|||
continue
|
||||
if loaded_module:
|
||||
step_items.append(loaded_module)
|
||||
# TODO temp solution
|
||||
if module_type == "storage":
|
||||
ArchivingContext.set("storages", step_items, keep_on_reset=True)
|
||||
|
||||
check_steps_ok()
|
||||
self.config['steps'][f"{module_type}s"] = step_items
|
||||
|
||||
setattr(self, f"{module_type}s", step_items)
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
|
||||
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
|
||||
self.config['steps'][f"{module_type}s"] = step_items
|
||||
return read_yaml(config_file)
|
||||
|
||||
def run(self) -> None:
|
||||
def run(self, args: list) -> None:
|
||||
|
||||
self.setup_basic_parser()
|
||||
|
||||
# parse the known arguments for now (basically, we want the config file)
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args(args)
|
||||
|
||||
# load the config file to get the list of enabled items
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args()
|
||||
# setup any custom module paths, so they'll show in the help and for arg parsing
|
||||
setup_paths(basic_config.module_paths)
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if basic_config.help:
|
||||
self.show_help()
|
||||
self.show_help(basic_config)
|
||||
|
||||
# load the config file
|
||||
yaml_config = {}
|
||||
|
||||
if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
|
||||
|
||||
yaml_config = read_yaml(basic_config.config_file)
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules()
|
||||
self.install_modules(self.config['steps'])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
for item in self.feed():
|
||||
for _ in self.feed():
|
||||
pass
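For context, an assumed shape of `config['steps']` after parsing, as consumed by `install_modules` above; the module names below are illustrative only.

```python
# Assumed structure (a sketch, not taken from the diff): one list of module
# names per module type, filled from the YAML config and/or the command line.
steps = {
    "feeders": ["cli_feeder"],
    "extractors": ["generic_extractor"],
    "enrichers": ["hash_enricher"],
    "databases": ["console_db"],
    "storages": ["gdrive_storage"],
    "formatters": ["html_formatter"],  # assumed name, not shown in this diff
}
```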
|
||||
|
||||
def cleanup(self)->None:
|
||||
logger.info("Cleaning up")
|
||||
for e in self.config['steps']['extractors']:
|
||||
for e in self.extractors:
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for feeder in self.config['steps']['feeders']:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
yield self.feed_item(item)
|
||||
url_count += 1
|
||||
|
||||
logger.success(f"Processed {url_count} URL(s)")
|
||||
self.cleanup()
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
|
@ -291,22 +365,33 @@ class ArchivingOrchestrator:
|
|||
- catches keyboard interruptions to do a clean exit
|
||||
- catches any unexpected error, logs it, and does a clean exit
|
||||
"""
|
||||
tmp_dir: TemporaryDirectory = None
|
||||
try:
|
||||
ArchivingContext.reset()
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||
ArchivingContext.set_tmp_dir(tmp_dir)
|
||||
return self.archive(item)
|
||||
tmp_dir = TemporaryDirectory(dir="./")
|
||||
# set tmp_dir on all modules
|
||||
for m in self.all_modules:
|
||||
m.tmp_dir = tmp_dir.name
|
||||
return self.archive(item)
|
||||
except KeyboardInterrupt:
|
||||
# catches keyboard interruptions to do a clean exit
|
||||
logger.warning(f"caught interrupt on {item=}")
|
||||
for d in self.config['steps']['databases']: d.aborted(item)
|
||||
for d in self.databases:
|
||||
d.aborted(item)
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
for d in self.config['steps']['databases']:
|
||||
if type(e) == AssertionError: d.failed(item, str(e))
|
||||
else: d.failed(item, reason="unexpected error")
|
||||
for d in self.databases:
|
||||
if type(e) == AssertionError:
|
||||
d.failed(item, str(e))
|
||||
else:
|
||||
d.failed(item, reason="unexpected error")
|
||||
finally:
|
||||
if tmp_dir:
|
||||
# remove the tmp_dir from all modules
|
||||
for m in self.all_modules:
|
||||
m.tmp_dir = None
|
||||
tmp_dir.cleanup()
|
||||
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
|
@ -319,31 +404,38 @@ class ArchivingOrchestrator:
|
|||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
self.assert_valid_url(original_url)
|
||||
try:
|
||||
self.assert_valid_url(original_url)
|
||||
except AssertionError as e:
|
||||
logger.error(f"Error archiving URL {original_url}: {e}")
|
||||
raise e
|
||||
|
||||
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
|
||||
url = original_url
|
||||
for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url)
|
||||
for a in self.extractors:
|
||||
url = a.sanitize_url(url)
|
||||
|
||||
result.set_url(url)
|
||||
if original_url != url: result.set("original_url", original_url)
|
||||
|
||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||
cached_result = None
|
||||
for d in self.config["steps"]["databases"]:
|
||||
for d in self.databases:
|
||||
d.started(result)
|
||||
if (local_result := d.fetch(result)):
|
||||
cached_result = (cached_result or Metadata()).merge(local_result)
|
||||
if local_result := d.fetch(result):
|
||||
cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
|
||||
if cached_result:
|
||||
logger.debug("Found previously archived entry")
|
||||
for d in self.config["steps"]["databases"]:
|
||||
for d in self.databases:
|
||||
try: d.done(cached_result, cached=True)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
return cached_result
|
||||
|
||||
# 3 - call extractors until one succeeds
|
||||
for a in self.config["steps"]["extractors"]:
|
||||
for a in self.extractors:
|
||||
logger.info(f"Trying extractor {a.name} for {url}")
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
|
@ -352,24 +444,25 @@ class ArchivingOrchestrator:
|
|||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.config["steps"]["enrichers"]:
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
# 5 - store all downloaded/generated media
|
||||
result.store()
|
||||
result.store(storages=self.storages)
|
||||
|
||||
# 6 - format and store formatted if needed
|
||||
if final_media := self.config["steps"]["formatters"][0].format(result):
|
||||
final_media.store(url=url, metadata=result)
|
||||
final_media: Media
|
||||
if final_media := self.formatters[0].format(result):
|
||||
final_media.store(url=url, metadata=result, storages=self.storages)
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
result.status = "nothing archived"
|
||||
|
||||
# signal completion to databases and archivers
|
||||
for d in self.config["steps"]["databases"]:
|
||||
for d in self.databases:
|
||||
try: d.done(result)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
@ -394,4 +487,11 @@ class ArchivingOrchestrator:
|
|||
assert ip.is_global, f"Invalid IP used"
|
||||
assert not ip.is_reserved, f"Invalid IP used"
|
||||
assert not ip.is_link_local, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@property
|
||||
def all_modules(self) -> List[Type[BaseModule]]:
|
||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
|
@ -1,25 +1,23 @@
|
|||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import IO, Optional
|
||||
from typing import IO
|
||||
import os
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
@dataclass
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from auto_archiver.core.module import get_module
|
||||
class Storage(BaseModule):
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
|
||||
if media.is_stored():
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
self.set_key(media, url)
|
||||
self.set_key(media, url, metadata)
|
||||
self.upload(media, metadata=metadata)
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
||||
|
@ -30,34 +28,35 @@ class Storage(BaseModule):
|
|||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url) -> None:
|
||||
def set_key(self, media: Media, url, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = ArchivingContext.get("folder", "")
|
||||
folder = metadata.get_context('folder', '')
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
path_generator = ArchivingContext.get("path_generator", "url")
|
||||
path_generator = self.config.get("path_generator", "url")
|
||||
if path_generator == "flat":
|
||||
path = ""
|
||||
filename = slugify(filename) # Ensure filename is slugified
|
||||
elif path_generator == "url":
|
||||
path = slugify(url)
|
||||
elif path_generator == "random":
|
||||
path = ArchivingContext.get("random_path", random_str(24), True)
|
||||
path = self.config.get("random_path", random_str(24), True)
|
||||
else:
|
||||
raise ValueError(f"Invalid path_generator: {path_generator}")
|
||||
|
||||
# Handle filename_generator logic
|
||||
filename_generator = ArchivingContext.get("filename_generator", "random")
|
||||
filename_generator = self.config.get("filename_generator", "random")
|
||||
if filename_generator == "random":
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
# load the hash_enricher module
|
||||
he = get_module(HashEnricher, self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
|
|
|
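As a rough illustration (assumptions, not part of this commit): the `path_generator` and `filename_generator` options handled above combine into a storage key roughly as follows. The `.mp4` extension and the example URL are made up, and the placeholder strings stand in for the random string and hash used by the real code.

```python
from slugify import slugify

def example_key(url: str, folder: str = "",
                path_generator: str = "url",
                filename_generator: str = "random") -> str:
    # pick the directory part of the key
    path = {"flat": "", "url": slugify(url), "random": "<24 random chars>"}[path_generator]
    # pick the file name part of the key
    name = {"random": "<24 random chars>", "static": "<first 24 chars of the file hash>"}[filename_generator]
    return "/".join(part for part in (folder, path, name + ".mp4") if part)

# example_key("https://example.com/post/1", folder="cli")
#   -> 'cli/https-example-com-post-1/<24 random chars>.mp4'
```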
@ -1,7 +1,19 @@
# used as validators for config values.
# used as validators for config values. Should raise an exception if the value is invalid.
from pathlib import Path
import argparse

def example_validator(value):
    return "example" in value
    if "example" not in value:
        raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
    return value

def positive_number(value):
    return value > 0
    if value < 0:
        raise argparse.ArgumentTypeError(f"{value} is not a positive number")
    return value


def valid_file(value):
    if not Path(value).is_file():
        raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
    return value
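A minimal sketch (not part of this commit) of why the rewritten validators raise `argparse.ArgumentTypeError` rather than returning a boolean: argparse uses the `type=` callable as a converter, so a returned `False` would silently become the parsed value, whereas raising produces a proper usage error. The `--retries` flag and the `int()` conversion are invented for the illustration.

```python
import argparse

def positive_number(value):
    value = int(value)  # assumption: CLI values arrive as strings and need converting
    if value < 0:
        raise argparse.ArgumentTypeError(f"{value} is not a positive number")
    return value

parser = argparse.ArgumentParser()
parser.add_argument("--retries", type=positive_number, default=1)
print(parser.parse_args(["--retries", "3"]).retries)  # prints 3
```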
@ -1 +1 @@
from api_db import AAApiDb
from .api_db import AAApiDb
@ -1,28 +1,49 @@
|
|||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db:AAApiDb",
|
||||
"entry_point": "api_db::AAApiDb",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["requests",
|
||||
"loguru"],
|
||||
"dependencies": {
|
||||
"python": ["requests", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_token": {"default": None, "help": "API Bearer token."},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL",}
|
||||
"api_endpoint": {
|
||||
"required": True,
|
||||
"help": "API endpoint where calls are made to",
|
||||
},
|
||||
"api_token": {"default": None,
|
||||
"help": "API Bearer token."},
|
||||
"public": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the URL should be publicly available via the API",
|
||||
},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {
|
||||
"default": None,
|
||||
"help": "which group of users have access to the archive in case public=false as author",
|
||||
},
|
||||
"use_api_cache": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
|
||||
},
|
||||
"store_results": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "when set, will send the results to the API database.",
|
||||
},
|
||||
"tags": {
|
||||
"default": [],
|
||||
"help": "what tags to add to the archived URL",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
|
||||
- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is disabled.
|
||||
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
|
||||
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from typing import Union
|
||||
import requests, os
|
||||
|
||||
import os
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
|
@ -7,27 +9,17 @@ from auto_archiver.core import Metadata
|
|||
|
||||
|
||||
class AAApiDb(Database):
|
||||
"""
|
||||
Connects to auto-archiver-api instance
|
||||
"""
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.allow_rearchive = bool(self.allow_rearchive)
|
||||
self.store_results = bool(self.store_results)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
|
||||
"""Connects to auto-archiver-api instance"""
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""
|
||||
if not self.allow_rearchive: return
|
||||
|
||||
if not self.use_api_cache: return
|
||||
|
||||
params = {"url": item.get_url(), "limit": 15}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
||||
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
|
||||
response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
if len(response.json()):
|
||||
|
@ -38,21 +30,26 @@ class AAApiDb(Database):
|
|||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
return False
|
||||
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
if not self.store_results: return
|
||||
if cached:
|
||||
if cached:
|
||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||
return
|
||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||
payload = {
|
||||
'author_id': self.author_id,
|
||||
'url': item.get_url(),
|
||||
'public': self.public,
|
||||
'group_id': self.group_id,
|
||||
'tags': list(self.tags),
|
||||
'result': item.to_json(),
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
|
||||
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
if response.status_code == 201:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
from .atlos import AtlosStorage
@ -1,40 +0,0 @@
|
|||
{
|
||||
"name": "atlos_storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {"python": ["loguru", "requests"], "bin": [""]},
|
||||
"configs": {
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
},
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"type": "str",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosStorage: A storage module for saving media files to the Atlos platform.
|
||||
|
||||
### Features
|
||||
- Uploads media files to Atlos using Atlos-specific APIs.
|
||||
- Automatically calculates SHA-256 hashes of media files for integrity verification.
|
||||
- Skips uploads for files that already exist on Atlos with the same hash.
|
||||
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
|
||||
- Provides CDN-like URLs for accessing uploaded media.
|
||||
|
||||
### Notes
|
||||
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
|
||||
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
|
||||
""",
|
||||
}
|
|
@ -1,9 +1,9 @@
|
|||
{
|
||||
"name": "Atlos Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "atlos_db:AtlosDb",
|
||||
"entry_point": "atlos_db::AtlosDb",
|
||||
"requires_setup": True,
|
||||
"external_dependencies":
|
||||
"dependencies":
|
||||
{"python": ["loguru",
|
||||
""],
|
||||
"bin": [""]},
|
||||
|
|
|
@ -1,14 +1,10 @@
|
|||
import os
|
||||
|
||||
from typing import Union
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosDb(Database):
|
||||
|
|
|
@ -2,14 +2,14 @@
|
|||
"name": "Atlos Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"type": "str",
|
||||
"required": True,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
|
|
|
@ -1,19 +1,12 @@
|
|||
from loguru import logger
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AtlosFeeder(Feeder):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
if type(self.api_token) != str:
|
||||
raise Exception("Atlos Feeder did not receive an Atlos API token")
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
# Get all the urls from the Atlos API
|
||||
count = 0
|
||||
|
@ -47,5 +40,3 @@ class AtlosFeeder(Feeder):
|
|||
|
||||
if len(data["results"]) == 0 or cursor is None:
|
||||
break
|
||||
|
||||
logger.success(f"Processed {count} URL(s)")
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os
|
||||
from typing import IO, List, Optional
|
||||
from loguru import logger
|
||||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Optional
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.core import Storage
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosStorage(Storage):
|
|
@ -1 +0,0 @@
from .cli_feeder import CLIFeeder
@ -1,27 +0,0 @@
|
|||
{
|
||||
"name": "CLI Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
'entry_point': 'cli_feeder::CLIFeeder',
|
||||
"configs": {
|
||||
"urls": {
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"nargs": "+",
|
||||
"required": True,
|
||||
"do_not_store": True,
|
||||
"metavar": "INPUT URLS",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
|
||||
|
||||
### Features
|
||||
- Takes a single URL or a list of URLs provided via the command line.
|
||||
- Converts each URL into a `Metadata` object and yields it for processing.
|
||||
- Ensures URLs are processed only if they are explicitly provided.
|
||||
|
||||
"""
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
for url in self.urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
yield Metadata().set_url(url)
|
||||
ArchivingContext.set("folder", "cli")
|
||||
|
||||
logger.success(f"Processed {len(self.urls)} URL(s)")
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Console Database",
|
||||
"type": ["database"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"description": """
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "CSV Database",
|
||||
"type": ["database"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {"python": ["loguru"]
|
||||
"dependencies": {"python": ["loguru"]
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"configs": {
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "CSV Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": [""]
|
||||
},
|
||||
|
@ -13,6 +13,9 @@
|
|||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
Input files should be formatted with one URL per line",
|
||||
"required": True,
|
||||
"type": "valid_file",
|
||||
"nargs": "+",
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
|
@ -26,9 +29,9 @@
|
|||
- Supports reading URLs from multiple input files, specified as a comma-separated list.
|
||||
- Allows specifying the column number or name to extract URLs from.
|
||||
- Skips header rows if the first value is not a valid URL.
|
||||
- Integrates with the `ArchivingContext` to manage URL feeding.
|
||||
|
||||
### Setu N
|
||||
- Input files should be formatted with one URL per line.
|
||||
### Setup
|
||||
- Input files should be formatted with one URL per line, with or without a header row.
|
||||
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
|
||||
"""
|
||||
}
|
||||
|
|
|
@ -2,24 +2,37 @@ from loguru import logger
|
|||
import csv
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import url_or_none
|
||||
|
||||
class CSVFeeder(Feeder):
|
||||
|
||||
column = None
|
||||
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
url_column = self.column or 0
|
||||
for file in self.files:
|
||||
with open(file, "r") as f:
|
||||
reader = csv.reader(f)
|
||||
first_row = next(reader)
|
||||
if not(url_or_none(first_row[url_column])):
|
||||
# it's a header row, skip it
|
||||
url_column = self.column or 0
|
||||
if isinstance(url_column, str):
|
||||
try:
|
||||
url_column = first_row.index(url_column)
|
||||
except ValueError:
|
||||
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
|
||||
return
|
||||
elif not(url_or_none(first_row[url_column])):
|
||||
# it's a header row, but we've been given a column number already
|
||||
logger.debug(f"Skipping header row: {first_row}")
|
||||
for row in reader:
|
||||
url = row[0]
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url)
|
||||
ArchivingContext.set("folder", "cli")
|
||||
else:
|
||||
# first row isn't a header row, rewind the file
|
||||
f.seek(0)
|
||||
|
||||
logger.success(f"Processed {len(self.urls)} URL(s)")
|
||||
for row in reader:
|
||||
if not url_or_none(row[url_column]):
|
||||
logger.warning(f"Not a valid URL in row: {row}, skipping")
|
||||
continue
|
||||
url = row[url_column]
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url)
|
|
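For orientation only, a standalone sketch (not part of this commit) of the header-detection idea the reworked CSV feeder applies: a named `column` implies the first row is a header, otherwise the file is rewound if the first row already looks like a URL. The file name, column name, and the simplified `startswith("http")` check stand in for the real `url_or_none` helper.

```python
import csv

def iter_urls(path: str, column=None):
    """Yield URLs from a CSV file that may or may not have a header row."""
    with open(path, newline="") as f:
        reader = csv.reader(f)
        first_row = next(reader)
        url_column = column or 0
        if isinstance(url_column, str):
            # a column *name* implies the first row is a header
            url_column = first_row.index(url_column)
        elif first_row[url_column].startswith("http"):
            # no header row: rewind so the first row is yielded too
            f.seek(0)
            reader = csv.reader(f)
        for row in reader:
            yield row[url_column]

# e.g. list(iter_urls("urls.csv", column="link"))
```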
@ -1,14 +1,14 @@
|
|||
{
|
||||
"name": "Google Drive Storage",
|
||||
"type": ["storage"],
|
||||
"author": "Dave Mateer",
|
||||
"entry_point": "gdrive_storage::GDriveStorage",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"google-api-python-client",
|
||||
"google-auth",
|
||||
"google-auth-oauthlib",
|
||||
"google-auth-httplib2"
|
||||
"googleapiclient",
|
||||
"google",
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
|
@ -18,17 +18,23 @@
|
|||
"choices": ["flat", "url", "random"],
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"root_folder_id": {"required": True,
|
||||
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None,
|
||||
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
|
||||
},
|
||||
"description": """
|
||||
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
Author: Dave Mateer, (And maintained by: )
|
||||
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
|
||||
|
||||
### Features
|
||||
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
|
||||
- Supports OAuth token-based authentication or service account credentials for API access.
|
||||
|
@ -39,5 +45,55 @@
|
|||
- Requires setup with either a Google OAuth token or a service account JSON file.
|
||||
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
|
||||
- Automatically handles Google Drive API token refreshes for long-running jobs.
|
||||
"""
|
||||
|
||||
## Overview
|
||||
This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
|
||||
|
||||
## Features
|
||||
- Saves files to Google Drive, organizing them into structured folders.
|
||||
- Supports both **service account** and **OAuth token** authentication.
|
||||
- Automatically creates folders if they don't exist.
|
||||
- Generates public URLs for easy file sharing.
|
||||
|
||||
## Setup Guide
|
||||
1. **Enable Google Drive API**
|
||||
- Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
|
||||
- Enable the **Google Drive API**.
|
||||
|
||||
2. **Set Up a Google Drive Folder**
|
||||
- Create a folder in **Google Drive** and copy its **folder ID** from the URL.
|
||||
- Add the **folder ID** to your configuration (`orchestration.yaml`):
|
||||
```yaml
|
||||
root_folder_id: "FOLDER_ID"
|
||||
```
|
||||
|
||||
3. **Authentication Options**
|
||||
- **Option 1: Service Account (Recommended)**
|
||||
- Create a **service account** in Google Cloud IAM.
|
||||
- Download the JSON key file and save it as:
|
||||
```
|
||||
secrets/service_account.json
|
||||
```
|
||||
- **Share your Drive folder** with the service account’s `client_email` (found in the JSON file).
|
||||
|
||||
- **Option 2: OAuth Token (User Authentication)**
|
||||
- Create OAuth **Desktop App credentials** in Google Cloud.
|
||||
- Save the credentials as:
|
||||
```
|
||||
secrets/oauth_credentials.json
|
||||
```
|
||||
- Generate an OAuth token by running:
|
||||
```sh
|
||||
python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
|
||||
```
|
||||
|
||||
|
||||
Notes on the OAuth token:
|
||||
Tokens are refreshed after 1 hour however keep working for 7 days (tbc)
|
||||
so as long as the job doesn't last for 7 days then this method of refreshing only once per run will work
|
||||
see this link for details on the token:
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
|
|
|
@ -1,68 +1,67 @@
|
|||
|
||||
import shutil, os, time, json
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import IO
|
||||
from loguru import logger
|
||||
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from google.auth.transport.requests import Request
|
||||
from google.oauth2 import service_account
|
||||
from google.oauth2.credentials import Credentials
|
||||
from google.auth.transport.requests import Request
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaFileUpload
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
|
||||
|
||||
class GDriveStorage(Storage):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
def setup(self) -> None:
|
||||
self.scopes = ['https://www.googleapis.com/auth/drive']
|
||||
# Initialize Google Drive service
|
||||
self._setup_google_drive_service()
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
|
||||
if self.oauth_token is not None:
|
||||
"""
|
||||
Tokens are refreshed after 1 hour
|
||||
however keep working for 7 days (tbc)
|
||||
so as long as the job doesn't last for 7 days
|
||||
then this method of refreshing only once per run will work
|
||||
see this link for details on the token
|
||||
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
|
||||
"""
|
||||
logger.debug(f'Using GD OAuth token {self.oauth_token}')
|
||||
# workaround for missing 'refresh_token' in from_authorized_user_file
|
||||
with open(self.oauth_token, 'r') as stream:
|
||||
creds_json = json.load(stream)
|
||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
||||
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
|
||||
# creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
|
||||
|
||||
if not creds or not creds.valid:
|
||||
if creds and creds.expired and creds.refresh_token:
|
||||
logger.debug('Requesting new GD OAuth token')
|
||||
creds.refresh(Request())
|
||||
else:
|
||||
raise Exception("Problem with creds - create the token again")
|
||||
|
||||
# Save the credentials for the next run
|
||||
with open(self.oauth_token, 'w') as token:
|
||||
logger.debug('Saving new GD OAuth token')
|
||||
token.write(creds.to_json())
|
||||
else:
|
||||
logger.debug('GD OAuth Token valid')
|
||||
def _setup_google_drive_service(self):
|
||||
"""Initialize Google Drive service based on provided credentials."""
|
||||
if self.oauth_token:
|
||||
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
|
||||
self.service = self._initialize_with_oauth_token()
|
||||
elif self.service_account:
|
||||
logger.debug(f"Using Google Drive service account: {self.service_account}")
|
||||
self.service = self._initialize_with_service_account()
|
||||
else:
|
||||
gd_service_account = self.service_account
|
||||
logger.debug(f'Using GD Service Account {gd_service_account}')
|
||||
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
|
||||
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
|
||||
|
||||
self.service = build('drive', 'v3', credentials=creds)
|
||||
def _initialize_with_oauth_token(self):
|
||||
"""Initialize Google Drive service with OAuth token."""
|
||||
with open(self.oauth_token, 'r') as stream:
|
||||
creds_json = json.load(stream)
|
||||
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
|
||||
|
||||
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
|
||||
if not creds.valid and creds.expired and creds.refresh_token:
|
||||
creds.refresh(Request())
|
||||
with open(self.oauth_token, 'w') as token_file:
|
||||
logger.debug("Saving refreshed OAuth token.")
|
||||
token_file.write(creds.to_json())
|
||||
elif not creds.valid:
|
||||
raise ValueError("Invalid OAuth token. Please regenerate the token.")
|
||||
|
||||
return build('drive', 'v3', credentials=creds)
|
||||
|
||||
def _initialize_with_service_account(self):
|
||||
"""Initialize Google Drive service with service account."""
|
||||
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
|
||||
return build('drive', 'v3', credentials=creds)
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
"""
|
||||
only support files saved in a folder for GD
|
||||
S3 supports folder and all stored in the root
|
||||
"""
|
||||
|
||||
# full_name = os.path.join(self.folder, media.key)
|
||||
parent_id, folder_id = self.root_folder_id, None
|
||||
path_parts = media.key.split(os.path.sep)
|
||||
|
@ -71,13 +70,16 @@ class GDriveStorage(Storage):
|
|||
for folder in path_parts[0:-1]:
|
||||
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
|
||||
parent_id = folder_id
|
||||
|
||||
# get id of file inside folder (or sub folder)
|
||||
file_id = self._get_id_from_parent_and_name(folder_id, filename)
|
||||
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
|
||||
if not file_id:
|
||||
#
|
||||
logger.info(f"file {filename} not found in folder {folder_id}")
|
||||
return None
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
"""
|
||||
1. for each sub-folder in the path check if exists or create
|
||||
2. upload file to root_id/other_paths.../filename
|
||||
|
@ -105,7 +107,13 @@ class GDriveStorage(Storage):
|
|||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
|
||||
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
|
||||
def _get_id_from_parent_and_name(self, parent_id: str,
|
||||
name: str,
|
||||
retries: int = 1,
|
||||
sleep_seconds: int = 10,
|
||||
use_mime_type: bool = False,
|
||||
raise_on_missing: bool = True,
|
||||
use_cache=False):
|
||||
"""
|
||||
Retrieves the id of a folder or file from its @name and the @parent_id folder
|
||||
Optionally does multiple @retries and sleeps @sleep_seconds between them
|
||||
|
@ -168,8 +176,3 @@ class GDriveStorage(Storage):
|
|||
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
|
||||
return gd_folder.get('id')
|
||||
|
||||
# def exists(self, key):
|
||||
# try:
|
||||
# self.get_cdn_url(key)
|
||||
# return True
|
||||
# except: return False
|
||||
|
|
|
@ -20,6 +20,7 @@ the broader archiving framework.
|
|||
- Retrieves metadata like titles, descriptions, upload dates, and durations.
|
||||
- Downloads subtitles and comments when enabled.
|
||||
- Configurable options for handling live streams, proxies, and more.
|
||||
- Supports authentication of websites using the 'authentication' settings from your orchestration.
|
||||
|
||||
### Dropins
|
||||
- For websites supported by `yt-dlp` that also contain posts in addition to videos
|
||||
|
@ -29,10 +30,6 @@ custom dropins can be created to handle additional websites and passed to the ar
|
|||
via the command line using the `--dropins` option (TODO!).
|
||||
""",
|
||||
"configs": {
|
||||
"facebook_cookie": {
|
||||
"default": None,
|
||||
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
|
||||
},
|
||||
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
|
||||
"comments": {
|
||||
"default": False,
|
||||
|
@ -67,14 +64,5 @@ via the command line using the `--dropins` option (TODO!).
|
|||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
|
||||
},
|
||||
"cookies_from_browser": {
|
||||
"default": None,
|
||||
"type": "str",
|
||||
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
|
||||
},
|
||||
"cookie_file": {
|
||||
"default": None,
|
||||
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
@ -23,19 +23,8 @@ class Bluesky(GenericDropin):
|
|||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
|
||||
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
|
||||
# handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
|
||||
# return ie_instance._extract_post(handle=handle, post_id=video_id)
|
||||
|
||||
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
|
||||
return ie_instance._download_json(
|
||||
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
|
||||
video_id, query={
|
||||
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
|
||||
'depth': 0,
|
||||
'parentHeight': 0,
|
||||
})['thread']['post']
|
||||
|
||||
|
||||
return ie_instance._extract_post(handle=handle, post_id=video_id)
|
||||
|
||||
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
||||
"""
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
class Facebook(GenericDropin):
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
video_id = ie_instance._match_valid_url(url).group('id')
|
||||
ie_instance._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
|
||||
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
metadata = archiver.create_metadata(url)
|
||||
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
|
||||
return metadata
|
|
@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
@ -266,19 +266,30 @@ class GenericExtractor(Extractor):
|
|||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
if item.netloc in ['youtube.com', 'www.youtube.com']:
|
||||
if self.cookies_from_browser:
|
||||
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
|
||||
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
|
||||
elif self.cookie_file:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file}')
|
||||
ydl_options['cookiefile'] = self.cookie_file
|
||||
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
||||
'quiet': False, 'noplaylist': not self.allow_playlist ,
|
||||
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
|
||||
"live_from_start": self.live_from_start, "proxy": self.proxy,
|
||||
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
ydl_options['username'] = auth['username']
|
||||
ydl_options['password'] = auth['password']
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookie_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
|
||||
ydl_options['cookiesfile'] = auth['cookies_file']
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
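For reference, a hypothetical value for the `--authentication` option described earlier (a dictionary mapping sites to credentials); all site names and values here are invented. The code above checks username/password first, then a raw cookie, then browser-extracted cookies, then a cookies file.

```python
# Hypothetical --authentication mapping (site names and values are invented).
example_authentication = {
    "twitter.com": {"username": "my_user", "password": "my_pass"},
    "facebook.com": {"cookie": "datr=xxxx; c_user=yyyy"},
    "tiktok.com": {"cookies_file": "secrets/tiktok_cookies.txt"},
}
```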
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from loguru import logger
|
|||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
"type": ["database"],
|
||||
"entry_point": "gsheet_db::GsheetsDb",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
|
@ -17,6 +17,7 @@
|
|||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
|
|
|
@ -1,39 +1,38 @@
|
|||
from typing import Union, Tuple
|
||||
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
"""
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, f'Archive failed {reason}')
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
self._safe_status_update(item, '')
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check if the given item has been archived already"""
|
||||
return False
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item.get_url()}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
|
@ -45,23 +44,25 @@ class GsheetsDb(Database):
|
|||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
try:
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
status_message = f"[cached] {status_message}"
|
||||
cell_updates.append((row, 'status', status_message))
|
||||
cell_updates.append((row, "status", status_message))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", ""))
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||
batch_if_valid("archive", "\n".join(media.urls))
|
||||
batch_if_valid("date", True, get_current_timestamp())
|
||||
batch_if_valid("title", item.get_title())
|
||||
batch_if_valid("text", item.get("content", ""))
|
||||
batch_if_valid("timestamp", item.get_timestamp())
|
||||
if media:
|
||||
batch_if_valid("hash", media.get("hash", "not-calculated"))
|
||||
|
||||
# merge all pdq hashes into a single string, if present
|
||||
pdq_hashes = []
|
||||
|
@ -70,34 +71,44 @@ class GsheetsDb(Database):
|
|||
if pdq := m.get("pdq_hash"):
|
||||
pdq_hashes.append(pdq)
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if (thumbnail := item.get_first_image("thumbnail")):
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if (browsertrix := item.get_media_by_id("browsertrix")):
|
||||
batch_if_valid('wacz', "\n".join(browsertrix.urls))
|
||||
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
batch_if_valid(
|
||||
"replaywebpage",
|
||||
"\n".join(
|
||||
[
|
||||
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
|
||||
for wacz in browsertrix.urls
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, 'status', new_status)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
|
||||
if gsheet := ArchivingContext.get("gsheet"):
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
print(self.sheet_id)
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
|
|
|
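For reference, a small self-contained sketch of the batching pattern used in `done()` above: updates are collected as (row, column, value) tuples and written in one call. A stub stands in for `GWorksheet`, whose real batch method is assumed to accept that list.

```python
# Illustrative sketch of the batch_if_valid / cell_updates pattern above.
class StubWorksheet:
    def __init__(self):
        self.cells = {}  # (row, col) -> value

    def col_exists(self, col: str) -> bool:
        return True

    def get_cell(self, row: int, col: str) -> str:
        return self.cells.get((row, col), "")

    def batch_set_cell(self, updates: list[tuple[int, str, str]]) -> None:
        # a real implementation would issue a single Sheets API batch update
        for row, col, value in updates:
            self.cells[(row, col)] = value

gw, row = StubWorksheet(), 2
cell_updates = []

def batch_if_valid(col, val, final_value=None):
    # only queue an update when the column exists and the cell is still empty
    final_value = final_value or val
    if val and gw.col_exists(col) and gw.get_cell(row, col) == "":
        cell_updates.append((row, col, final_value))

batch_if_valid("status", "archived")
batch_if_valid("title", "example title")
gw.batch_set_cell(cell_updates)  # one write instead of one call per cell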
@ -3,8 +3,8 @@
|
|||
"type": ["feeder"],
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
|
|
|
@ -15,14 +15,13 @@ from loguru import logger
|
|||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.core import Metadata
|
||||
from . import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Feeder):
|
||||
|
||||
def setup(self, config: dict):
|
||||
super().setup(config)
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
assert self.sheet or self.sheet_id, (
|
||||
|
@ -37,43 +36,48 @@ class GsheetsFeeder(Feeder):
|
|||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(wks.title):
|
||||
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
|
||||
logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
|
||||
gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
|
||||
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||
else:
|
||||
ArchivingContext.set("folder", folder, True)
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
yield m
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
else:
|
||||
m.set_context("folder", folder)
|
||||
|
||||
logger.success(f'Finished worksheet {wks.title}')
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
|
|
|
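A condensed sketch of the row-selection rules in `_process_rows` above, with plain dicts standing in for the worksheet: a row is only yielded when it has a URL and its status cell is still empty.

```python
# Illustrative only: the worksheet is replaced by a list of dicts.
rows = [
    {"url": "https://example.com/1", "status": ""},
    {"url": "", "status": ""},                                # skipped: no URL
    {"url": "https://example.com/2", "status": "archived"},   # skipped: already processed
]

def iter_unprocessed(rows):
    for row in rows:
        url = row["url"].strip()
        if not url:
            continue
        if row["status"] not in ["", None]:
            continue
        yield url

print(list(iter_unprocessed(rows)))  # ['https://example.com/1']
```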
@ -2,7 +2,7 @@
|
|||
"name": "Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
|
|
|
@ -11,7 +11,8 @@ import hashlib
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils.misc import calculate_file_hash
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
|
@ -19,16 +20,6 @@ class HashEnricher(Enricher):
|
|||
Calculates hashes for Media instances
|
||||
"""
|
||||
|
||||
def __init__(self, config: dict = None):
|
||||
"""
|
||||
Initialize the HashEnricher with a configuration dictionary.
|
||||
"""
|
||||
super().__init__()
|
||||
# TODO set these from the manifest?
|
||||
# Set default values
|
||||
self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256"
|
||||
self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7)
|
||||
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
@ -39,15 +30,10 @@ class HashEnricher(Enricher):
|
|||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
def calculate_hash(self, filename) -> str:
|
||||
hash = None
|
||||
hash_algo = None
|
||||
if self.algorithm == "SHA-256":
|
||||
hash = hashlib.sha256()
|
||||
hash_algo = hashlib.sha256
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash = hashlib.sha3_512()
|
||||
hash_algo = hashlib.sha3_512
|
||||
else: return ""
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(self.chunksize)
|
||||
if not buf: break
|
||||
hash.update(buf)
|
||||
return hash.hexdigest()
|
||||
return calculate_file_hash(filename, hash_algo, self.chunksize)
|
||||
|
|
|
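The manual hashing loop above is replaced by a shared helper; below is a sketch of what such a chunked-hash helper can look like (the real one lives in `auto_archiver.utils.misc`, and its exact signature is assumed here).

```python
import hashlib

def calculate_file_hash(filename: str, hash_algo=hashlib.sha256, chunksize: int = int(1.6e7)) -> str:
    # read the file in chunks so large media files do not have to fit in memory
    hasher = hash_algo()
    with open(filename, "rb") as f:
        while chunk := f.read(chunksize):
            hasher.update(chunk)
    return hasher.hexdigest()
```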
@ -2,8 +2,8 @@
|
|||
"name": "HTML Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "jinja2"],
|
||||
"dependencies": {
|
||||
"python": ["hash_enricher", "loguru", "jinja2"],
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import mimetypes, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
|
@ -8,20 +7,18 @@ import json
|
|||
import base64
|
||||
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
|
||||
@dataclass
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
template: any = None
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
def setup(self) -> None:
|
||||
"""Sets up the Jinja2 environment and loads the template."""
|
||||
super().setup(config) # Ensure the base class logic is executed
|
||||
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
|
||||
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
|
||||
|
||||
|
@ -48,12 +45,13 @@ class HtmlFormatter(Formatter):
|
|||
version=__version__
|
||||
)
|
||||
|
||||
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
|
||||
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
|
||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||
outf.write(content)
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
# get the already instantiated hash_enricher module
|
||||
he = get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
|
|
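For context, a minimal Jinja2 setup of the kind used in `setup()` above; the template name and variables are made up for the example.

```python
import os
import pathlib
from jinja2 import Environment, FileSystemLoader

template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)

# "example.html" is a hypothetical template name
template = environment.get_template("example.html")
html = template.render(title="Archived page", version="1.0.0")
print(html[:200])
```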
@ -1,7 +1,8 @@
|
|||
{
|
||||
"name": "Instagram API Extractor",
|
||||
"type": ["extractor"],
|
||||
"external_dependencies":
|
||||
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
|
||||
"dependencies":
|
||||
{"python": ["requests",
|
||||
"loguru",
|
||||
"retrying",
|
||||
|
@ -9,24 +10,31 @@
|
|||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"default": None, "help": "API endpoint to use"},
|
||||
"access_token": {"default": None,
|
||||
"help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"required": True,
|
||||
"help": "API endpoint to use"},
|
||||
"full_profile": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
|
||||
},
|
||||
"full_profile_max_posts": {
|
||||
"default": 0,
|
||||
"type": "int",
|
||||
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
|
||||
},
|
||||
"minimize_json_output": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if true, will remove empty values from the json output",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Archives various types of Instagram content using the Instagrapi API.
|
||||
|
||||
Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
|
||||
|
||||
### Features
|
||||
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
|
||||
- Supports advanced configuration options, including:
|
||||
|
|
|
@ -28,20 +28,14 @@ class InstagramAPIExtractor(Extractor):
|
|||
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
|
||||
"""
|
||||
|
||||
global_pattern = re.compile(
|
||||
valid_url = re.compile(
|
||||
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
||||
)
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("access_token")
|
||||
self.assert_valid_string("api_endpoint")
|
||||
self.full_profile_max_posts = int(self.full_profile_max_posts)
|
||||
def setup(self) -> None:
|
||||
if self.api_endpoint[-1] == "/":
|
||||
self.api_endpoint = self.api_endpoint[:-1]
|
||||
|
||||
self.full_profile = bool(self.full_profile)
|
||||
self.minimize_json_output = bool(self.minimize_json_output)
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
@ -49,7 +43,7 @@ class InstagramAPIExtractor(Extractor):
|
|||
url.replace("instagr.com", "instagram.com").replace(
|
||||
"instagr.am", "instagram.com"
|
||||
)
|
||||
insta_matches = self.global_pattern.findall(url)
|
||||
insta_matches = self.valid_url.findall(url)
|
||||
logger.info(f"{insta_matches=}")
|
||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||
return
|
||||
|
|
|
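A quick, hedged illustration of what the renamed `valid_url` pattern above matches; the exact groups depend on the URL shape, so the expected results are given as examples only.

```python
import re

valid_url = re.compile(
    r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)

# each match is a 3-tuple of (kind, id or username, optional numeric suffix)
print(valid_url.findall("https://www.instagram.com/p/ABC123/"))   # e.g. [('p', 'ABC123', '')]
print(valid_url.findall("https://www.instagram.com/some_user/"))  # e.g. [('', 'some_user', '')]
```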
@ -1,7 +1,7 @@
|
|||
{
|
||||
"name": "Instagram Extractor",
|
||||
"type": ["extractor"],
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"instaloader",
|
||||
"loguru",
|
||||
|
@ -9,9 +9,10 @@
|
|||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"default": None, "help": "a valid Instagram username"},
|
||||
"username": {"required": True,
|
||||
"help": "a valid Instagram username"},
|
||||
"password": {
|
||||
"default": None,
|
||||
"required": True,
|
||||
"help": "the corresponding Instagram account password",
|
||||
},
|
||||
"download_folder": {
|
||||
|
@ -25,9 +26,11 @@
|
|||
# TODO: fine-grain
|
||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||
},
|
||||
"description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
"description": """
|
||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts.
|
||||
Authentication is required via username/password or a session file.
|
||||
|
||||
""",
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
"""
|
||||
import re, os, shutil, traceback
|
||||
import instaloader # https://instaloader.github.io/as-module.html
|
||||
import instaloader
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
|
@ -16,19 +16,17 @@ class InstagramExtractor(Extractor):
|
|||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
# NB: post regex should be tested before profile
|
||||
|
||||
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
||||
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
|
||||
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
||||
# https://regex101.com/r/6Wbsxa/1
|
||||
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
|
||||
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
|
||||
# TODO: links to stories
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
# TODO: refactor how configuration validation is done
|
||||
self.assert_valid_string("username")
|
||||
self.assert_valid_string("password")
|
||||
self.assert_valid_string("download_folder")
|
||||
self.assert_valid_string("session_file")
|
||||
def setup(self) -> None:
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
)
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
{
|
||||
"name": "Instagram Telegram Bot Extractor",
|
||||
"type": ["extractor"],
|
||||
"external_dependencies": {"python": ["loguru",
|
||||
"telethon",],
|
||||
"dependencies": {"python": ["loguru", "telethon",],
|
||||
},
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
"timeout": {"default": 45,
|
||||
"type": "int",
|
||||
"help": "timeout to fetch the instagram content in seconds."},
|
||||
},
|
||||
"description": """
|
||||
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
|
||||
|
@ -28,6 +29,12 @@ returned as part of a `Metadata` object.
|
|||
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
|
||||
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
|
||||
- **Session File**: Optional path to store the Telegram session file for future use.
|
||||
|
||||
- The session file is created automatically and should be unique for each instance.
|
||||
- You may need to enter your Telegram credentials (phone number) and the 2FA code sent to you the first time you run the extractor:
|
||||
```
2025-01-30 00:43:49.348 | INFO | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login...
|
||||
Please enter your phone (or bot token): +447123456789
|
||||
Please enter the code you received: 00000
|
||||
Signed in successfully as E C; remember to not break the ToS or you will risk an account ban!
|
||||
```
|
||||
""",
|
||||
}
|
||||
|
|
|
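A hedged sketch of the first-time Telethon login the note above refers to; the credentials are placeholders, and running it interactively prompts for the phone number and 2FA code.

```python
from telethon.sync import TelegramClient

api_id = 123456                      # placeholder, from https://my.telegram.org/apps
api_hash = "0123456789abcdef"        # placeholder
session_file = "secrets/anon-insta"  # ".session" is appended automatically

# entering the context calls start(), which triggers the phone/2FA prompts on the first run
with TelegramClient(session_file, api_id, api_hash) as client:
    me = client.get_me()
    print(me.username)
```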
@ -16,7 +16,7 @@ from loguru import logger
|
|||
from telethon.sync import TelegramClient
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
|
@ -33,17 +33,30 @@ class InstagramTbotExtractor(Extractor):
|
|||
2. checks if the session file is valid
|
||||
"""
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
self._prepare_session_file()
|
||||
self._initialize_telegram_client()
|
||||
|
||||
# make a copy of the session that is used exclusively with this archiver instance
|
||||
def _prepare_session_file(self):
|
||||
"""
|
||||
Creates a copy of the session file for exclusive use with this archiver instance.
|
||||
Ensures that a valid session file exists before proceeding.
|
||||
"""
|
||||
new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
|
||||
if not os.path.exists(f"{self.session_file}.session"):
|
||||
raise FileNotFoundError(f"Session file {self.session_file}.session not found.")
|
||||
shutil.copy(self.session_file + ".session", new_session_file)
|
||||
self.session_file = new_session_file.replace(".session", "")
|
||||
|
||||
def _initialize_telegram_client(self):
|
||||
"""Initializes the Telegram client."""
|
||||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
|
||||
logger.error(
|
||||
f"Unable to access the {self.session_file} session. "
|
||||
"Ensure that you don't use the same session file here and in telethon_extractor. "
|
||||
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
|
||||
)
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
||||
|
||||
|
@ -58,34 +71,51 @@ class InstagramTbotExtractor(Extractor):
|
|||
if not "instagram.com" in url: return False
|
||||
|
||||
result = Metadata()
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
tmp_dir = self.tmp_dir
|
||||
with self.client.start():
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_id = self.client.send_message(entity=chat, message=url).id
|
||||
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
if "You must enter a URL to a post" in message:
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
return result.success("insta-via-bot")
|
||||
|
||||
def _send_url_to_bot(self, url: str):
|
||||
"""
|
||||
Sends the URL to the 'instagram_load_bot' and returns (chat, since_id).
|
||||
"""
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_message = self.client.send_message(entity=chat, message=url)
|
||||
return chat, since_message.id
|
||||
|
||||
def _process_messages(self, chat, since_id, tmp_dir, result):
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
# Skip known filler message:
|
||||
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
|
||||
continue
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
return message.strip()
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Local Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Archive Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"description": """
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Media Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": ["exiftool"]
|
||||
},
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Mute Formatter",
|
||||
"type": ["formatter"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
},
|
||||
"description": """ Default formatter.
|
||||
""",
|
||||
|
|
|
@ -1,11 +1,9 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
|
||||
|
||||
@dataclass
|
||||
class MuteFormatter(Formatter):
|
||||
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
"name": "PDQ Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "pdqhash", "numpy", "PIL"],
|
||||
},
|
||||
"description": """
|
||||
PDQ Hash Enricher for generating perceptual hashes of media files.
|
||||
|
|
|
@ -1 +1 @@
|
|||
from .s3 import S3Storage
|
||||
from .s3_storage import S3Storage
|
|
@ -2,17 +2,17 @@
|
|||
"name": "S3 Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["boto3", "loguru"],
|
||||
"dependencies": {
|
||||
"python": ["hash_enricher", "boto3", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"default": "flat",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
"choices": ["flat", "url", "random"],
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"default": "static",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
|
@ -20,7 +20,9 @@
|
|||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
"secret": {"default": None, "help": "S3 API secret"},
|
||||
"random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
|
||||
"random_no_duplicate": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
|
||||
"endpoint_url": {
|
||||
"default": 'https://{region}.digitaloceanspaces.com',
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
||||
|
@ -29,7 +31,9 @@
|
|||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
||||
},
|
||||
"private": {"default": False, "help": "if true S3 files will not be readable online"},
|
||||
"private": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "if true S3 files will not be readable online"},
|
||||
},
|
||||
"description": """
|
||||
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
||||
|
@ -45,5 +49,6 @@
|
|||
- Requires S3 credentials (API key and secret) and a bucket name to function.
|
||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||
- Uses `boto3` for interaction with the S3 API.
|
||||
- Depends on the `HashEnricher` module for hash calculation.
|
||||
"""
|
||||
}
|
||||
|
|
|
@ -1,19 +1,19 @@
|
|||
|
||||
from typing import IO
|
||||
import boto3, os
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
import boto3
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
from auto_archiver.utils.misc import calculate_file_hash, random_str
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
class S3Storage(Storage):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
def setup(self) -> None:
|
||||
self.s3 = boto3.client(
|
||||
's3',
|
||||
region_name=self.region,
|
||||
|
@ -21,7 +21,6 @@ class S3Storage(Storage):
|
|||
aws_access_key_id=self.key,
|
||||
aws_secret_access_key=self.secret
|
||||
)
|
||||
self.random_no_duplicate = bool(self.random_no_duplicate)
|
||||
if self.random_no_duplicate:
|
||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
||||
|
||||
|
@ -41,15 +40,13 @@ class S3Storage(Storage):
|
|||
extra_args['ContentType'] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
hd = calculate_file_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
|
@ -61,8 +58,7 @@ class S3Storage(Storage):
|
|||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
# checks if path exists and is not an empty folder
|
||||
if not path.endswith('/'):
|
|
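A hedged sketch of the `random_no_duplicate` flow implemented above: hash the file, derive a `no-dups/` prefix from the hash, and only upload when nothing exists under that prefix. The bucket name and helper function are assumptions, not the module's API.

```python
import hashlib
import os

import boto3

NO_DUPLICATES_FOLDER = "no-dups/"
s3 = boto3.client("s3", region_name="us-east-1")  # credentials resolved by boto3

def first_key_in_folder(bucket: str, prefix: str) -> str:
    # returns an existing key under the prefix, or "" if the folder is empty
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix.rstrip("/") + "/", MaxKeys=1)
    contents = resp.get("Contents", [])
    return contents[0]["Key"] if contents else ""

def upload_if_new(bucket: str, filename: str) -> str:
    with open(filename, "rb") as f:
        file_hash = hashlib.sha256(f.read()).hexdigest()
    prefix = os.path.join(NO_DUPLICATES_FOLDER, file_hash[:24])
    if existing := first_key_in_folder(bucket, prefix):
        return existing  # already uploaded, reuse the existing key
    key = os.path.join(prefix, os.path.basename(filename))
    s3.upload_file(filename, bucket, key)
    return key
```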
@ -2,7 +2,7 @@
|
|||
"name": "Screenshot Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
"bin": ["chromedriver"]
|
||||
},
|
||||
|
|
|
@ -6,8 +6,8 @@ from selenium.common.exceptions import TimeoutException
|
|||
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.utils import Webdriver, UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
|
@ -19,15 +19,17 @@ class ScreenshotEnricher(Enricher):
|
|||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
|
||||
auth = self.auth_for_site(url)
|
||||
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
|
||||
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
if self.save_to_pdf:
|
||||
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
|
||||
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
|
||||
pdf = driver.print_page(driver.print_options)
|
||||
with open(pdf_file, "wb") as f:
|
||||
f.write(base64.b64decode(pdf))
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
"name": "SSL Certificate Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "python-slugify"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "slugify"],
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"configs": {
|
||||
|
|
|
@ -4,7 +4,7 @@ from urllib.parse import urlparse
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class SSLEnricher(Enricher):
|
||||
|
@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
|
|||
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
|
||||
|
||||
cert = ssl.get_server_certificate((domain, 443))
|
||||
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
|
||||
with open(cert_fn, "w") as f: f.write(cert)
|
||||
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Telegram Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"requests",
|
||||
"bs4",
|
||||
|
@ -13,7 +13,7 @@
|
|||
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
|
||||
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
|
||||
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
|
||||
is advised for more comprehensive functionality.
|
||||
is advised for more comprehensive functionality and higher-quality media extraction.
|
||||
|
||||
### Features
|
||||
- Extracts images and videos from public Telegram message links (`t.me`).
|
||||
|
|
|
@ -1 +1 @@
|
|||
from .telethon_extractor import TelethonArchiver
|
||||
from .telethon_extractor import TelethonExtractor
|
|
@ -2,7 +2,7 @@
|
|||
"name": "telethon_extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["telethon",
|
||||
"loguru",
|
||||
"tqdm",
|
||||
|
|
|
@ -6,19 +6,20 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
|
|||
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
import re, time, os
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Extractor):
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
class TelethonExtractor(Extractor):
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
|
||||
"""
|
||||
1. makes a copy of session_file that is removed in cleanup
|
||||
2. trigger login process for telegram or proceed if already saved in a session file
|
||||
|
@ -92,7 +93,7 @@ class TelethonArchiver(Extractor):
|
|||
"""
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
match = self.link_pattern.search(url)
|
||||
match = self.valid_url.search(url)
|
||||
logger.debug(f"TELETHON: {match=}")
|
||||
if not match: return False
|
||||
|
||||
|
@ -120,7 +121,7 @@ class TelethonArchiver(Extractor):
|
|||
media_posts = self._get_media_posts_in_group(chat, post)
|
||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
tmp_dir = self.tmp_dir
|
||||
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
title = post.message
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
"name": "Thumbnail Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "ffmpeg-python"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "ffmpeg"],
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
|
|
|
@ -10,7 +10,7 @@ import ffmpeg, os
|
|||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
|
|||
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
|
||||
for m_id, m in enumerate(to_enrich.media[::]):
|
||||
if m.is_video():
|
||||
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
|
||||
folder = os.path.join(self.tmp_dir, random_str(24))
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
logger.debug(f"generating thumbnails for {m.filename}")
|
||||
duration = m.get("duration")
|
||||
|
|
|
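A small, hedged ffmpeg-python sketch of the thumbnail generation above: one frame is extracted per timestamp into a temporary folder. Filenames and timestamps are illustrative, and the `ffmpeg` binary must be installed.

```python
import os
import ffmpeg  # the ffmpeg-python package

def generate_thumbnails(video_path: str, folder: str, timestamps: list[float]) -> list[str]:
    os.makedirs(folder, exist_ok=True)
    outputs = []
    for i, ts in enumerate(timestamps):
        out = os.path.join(folder, f"thumb_{i}.jpg")
        # seek to the timestamp and grab a single frame
        ffmpeg.input(video_path, ss=ts).output(out, vframes=1).run(quiet=True, overwrite_output=True)
        outputs.append(out)
    return outputs

# usage (paths are placeholders):
# generate_thumbnails("video.mp4", "/tmp/thumbs", [1.0, 5.0, 10.0])
```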
@ -2,7 +2,7 @@
|
|||
"name": "Timestamping Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"slugify",
|
||||
|
|
|
@ -10,8 +10,7 @@ from asn1crypto.core import Asn1Value
|
|||
import certifi
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
"""
|
||||
|
@ -33,7 +32,7 @@ class TimestampingEnricher(Enricher):
|
|||
logger.warning(f"No hashes found in {url=}")
|
||||
return
|
||||
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
tmp_dir = self.tmp_dir
|
||||
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
|
||||
|
||||
data_to_sign = "\n".join(hashes)
|
||||
|
@ -102,9 +101,9 @@ class TimestampingEnricher(Enricher):
|
|||
|
||||
cert_chain = []
|
||||
for cert in path:
|
||||
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
|
||||
with open(cert_fn, "wb") as f:
|
||||
f.write(cert.dump())
|
||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
|
||||
|
||||
return cert_chain
|
||||
return cert_chain
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "Twitter API Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": ["requests",
|
||||
"loguru",
|
||||
"pytwitter",
|
||||
|
|
|
@ -9,14 +9,13 @@ from pytwitter import Api
|
|||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata,Media
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
class TwitterApiExtractor(Extractor):
|
||||
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def setup(self) -> None:
|
||||
self.api_index = 0
|
||||
self.apis = []
|
||||
if len(self.bearer_tokens):
|
||||
|
@ -54,7 +53,7 @@ class TwitterApiExtractor(Extractor):
|
|||
|
||||
def get_username_tweet_id(self, url):
|
||||
# detect URLs that we definitely cannot handle
|
||||
matches = self.link_pattern.findall(url)
|
||||
matches = self.valid_url.findall(url)
|
||||
if not len(matches): return False, False
|
||||
|
||||
username, tweet_id = matches[0] # only one URL supported
|
||||
|
|
|
@ -3,15 +3,19 @@
|
|||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"depends": ["core", "utils"],
|
||||
"external_dependencies": {
|
||||
"python": ["loguru",
|
||||
"vk_url_scraper"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "vk_url_scraper"],
|
||||
},
|
||||
"configs": {
|
||||
"username": {"default": None, "help": "valid VKontakte username"},
|
||||
"password": {"default": None, "help": "valid VKontakte password"},
|
||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||
"username": {"required": True,
|
||||
"help": "valid VKontakte username"},
|
||||
"password": {"required": True,
|
||||
"help": "valid VKontakte password"},
|
||||
"session_file": {
|
||||
"default": "secrets/vk_config.v2.json",
|
||||
"help": "valid VKontakte password",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
|
||||
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
|
||||
|
@ -31,6 +35,5 @@ To use the `VkArchiver`, you must provide valid VKontakte login credentials and
|
|||
|
||||
Credentials can be set in the configuration file or directly via environment variables. Ensure you
|
||||
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
|
||||
"""
|
||||
,
|
||||
""",
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
|
|||
|
||||
from auto_archiver.utils.misc import dump_payload
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class VkExtractor(Extractor):
|
||||
|
@ -12,10 +12,7 @@ class VkExtractor(Extractor):
|
|||
Currently only works for /wall posts
|
||||
"""
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("username")
|
||||
self.assert_valid_string("password")
|
||||
def setup(self) -> None:
|
||||
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
|
@ -37,7 +34,7 @@ class VkExtractor(Extractor):
|
|||
|
||||
result.set_content(dump_payload(vk_scrapes))
|
||||
|
||||
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
|
||||
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
|
||||
for filename in filenames:
|
||||
result.add_media(Media(filename))
|
||||
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"jsonlines",
|
||||
|
@ -25,6 +26,7 @@
|
|||
},
|
||||
"description": """
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||
|
||||
### Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
|
@ -33,7 +35,7 @@
|
|||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
### Notes
|
||||
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
|
||||
- Requires Docker for running `browsertrix-crawler`.
|
||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||
"""
|
||||
}
|
||||
|
|
|
@ -5,9 +5,9 @@ from zipfile import ZipFile
|
|||
from loguru import logger
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.core import Extractor, Enricher
|
||||
from auto_archiver.utils import UrlUtil, random_str
|
||||
from auto_archiver.utils import url as UrlUtil, random_str
|
||||
|
||||
|
||||
class WaczExtractorEnricher(Enricher, Extractor):
|
||||
|
@ -19,6 +19,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||
"""
|
||||
|
||||
def setup(self) -> None:
|
||||
|
||||
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
||||
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
||||
|
||||
|
@ -49,7 +50,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||
url = to_enrich.get_url()
|
||||
|
||||
collection = random_str(8)
|
||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||
|
||||
cmd = [
|
||||
|
@ -152,7 +153,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
|
||||
|
||||
# unzipping the .wacz
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
tmp_dir = self.tmp_dir
|
||||
unzipped_dir = os.path.join(tmp_dir, "unzipped")
|
||||
with ZipFile(wacz_filename, 'r') as z_obj:
|
||||
z_obj.extractall(path=unzipped_dir)
|
||||
|
|
|
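For reference, a hedged sketch of how media can be pulled out of a WACZ with `warcio`, in the spirit of the extraction code above; paths and the content-type filter are illustrative.

```python
import os
from glob import glob
from zipfile import ZipFile

from warcio.archiveiterator import ArchiveIterator

def extract_warc_payloads(wacz_filename: str, tmp_dir: str) -> list[str]:
    unzipped_dir = os.path.join(tmp_dir, "unzipped")
    with ZipFile(wacz_filename, "r") as z_obj:
        z_obj.extractall(path=unzipped_dir)

    saved = []
    # WACZ files keep their WARCs under archive/
    for warc_path in glob(os.path.join(unzipped_dir, "archive", "*.warc.gz")):
        with open(warc_path, "rb") as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != "response":
                    continue
                content_type = record.http_headers.get_header("Content-Type", "")
                if not content_type.startswith(("image/", "video/")):
                    continue
                out = os.path.join(tmp_dir, f"media_{len(saved)}")
                with open(out, "wb") as f:
                    f.write(record.content_stream().read())
                saved.append(out)
    return saved
```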
@ -1 +0,0 @@
|
|||
from .wayback_enricher import WaybackExtractorEnricher
|
|
@ -1,30 +0,0 @@
|
|||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"entry_point": "wayback_enricher::WaybackExtractorEnricher",
|
||||
"configs": {
|
||||
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
|
||||
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
|
||||
"key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"secret": {"default": None, "required": True, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
|
||||
},
|
||||
"description": """
|
||||
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
|
||||
|
||||
### Features
|
||||
- Archives URLs using the Internet Archive's Wayback Machine API.
|
||||
- Supports conditional archiving based on the existence of prior archives within a specified time range.
|
||||
- Provides proxies for HTTP and HTTPS requests.
|
||||
- Fetches and confirms the archive URL or provides a job ID for later status checks.
|
||||
|
||||
### Notes
|
||||
- Requires a valid Wayback Machine API key and secret.
|
||||
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
|
||||
"""
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
from .wayback_extractor_enricher import WaybackExtractorEnricher
|
|
@@ -0,0 +1,56 @@
{
    "name": "Wayback Machine Enricher",
    "type": ["enricher", "archiver"],
    "entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "requests"],
    },
    "configs": {
        "timeout": {
            "default": 15,
            "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
        },
        "if_not_archived_within": {
            "default": None,
            "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA",
        },
        "key": {
            "required": True,
            "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php",
        },
        "secret": {
            "required": True,
            "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php",
        },
        "proxy_http": {
            "default": None,
            "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port",
        },
        "proxy_https": {
            "default": None,
            "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port",
        },
    },
    "description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.

### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Supports optional HTTP and HTTPS proxies for outgoing requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.

### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate limiting by the Wayback Machine and retries status checks with exponential backoff.

### Steps to Get a Wayback API Key:
- Sign up for an account at [Internet Archive](https://archive.org/account/signup).
- Log in to your account.
- Navigate to your [account settings](https://archive.org/account), or follow https://archive.org/developers/tutorial-get-ia-credentials.html
- Under Wayback Machine API Keys, generate a new key.
- Note down your API key and secret, as they will be required for authentication.
""",
}
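For orientation, the Save Page Now (SPN2) calls that this enricher wraps look roughly like the sketch below. The `https://web.archive.org/save` endpoint and the `LOW key:secret` authorization header follow the public SPN2 documentation; the helper names, polling interval, and response-field handling are illustrative assumptions, not the module's actual code.

```python
# Rough sketch of the SPN2 calls behind this enricher (assumptions noted above).
import time
import requests

WAYBACK_KEY, WAYBACK_SECRET = "my-key", "my-secret"  # values from the "key"/"secret" configs
HEADERS = {
    "Accept": "application/json",
    "Authorization": f"LOW {WAYBACK_KEY}:{WAYBACK_SECRET}",
}

def submit(url: str, if_not_archived_within: int | None = None) -> str:
    """Ask the Wayback Machine to archive `url`; returns an SPN2 job id."""
    data = {"url": url}
    if if_not_archived_within:
        data["if_not_archived_within"] = if_not_archived_within
    r = requests.post("https://web.archive.org/save", headers=HEADERS, data=data)
    r.raise_for_status()
    return r.json()["job_id"]

def wait_for_archive(job_id: str, timeout: int = 15) -> str | None:
    """Poll the job status until success or until `timeout` seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = requests.get(f"https://web.archive.org/save/status/{job_id}", headers=HEADERS).json()
        if status.get("status") == "success":
            return f"https://web.archive.org/web/{status['timestamp']}/{status['original_url']}"
        time.sleep(2)
    return None  # caller falls back to reporting the job_id for a later manual check
```

This mirrors the `timeout` and `if_not_archived_within` configs above: if the deadline passes, only the job ID is recorded for a later status check.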
@@ -3,7 +3,7 @@ from loguru import logger
import time, requests

from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core import Metadata

class WaybackExtractorEnricher(Enricher, Extractor):
@@ -2,15 +2,19 @@
    "name": "Whisper Enricher",
    "type": ["enricher"],
    "requires_setup": True,
    "external_dependencies": {
        "python": ["loguru", "requests"],
    "dependencies": {
        "python": ["s3_storage", "loguru", "requests"],
    },
    "configs": {
        "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
        "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
        "api_endpoint": {"required": True,
                         "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
        "api_key": {"required": True,
                    "help": "WhisperApi api key for authentication"},
        "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
        "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
        "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
        "action": {"default": "translate",
                   "help": "which Whisper operation to execute",
                   "choices": ["transcribe", "translate", "language_detection"]},
    },
    "description": """
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.

@@ -25,6 +29,7 @@
### Notes
- Requires a Whisper API endpoint and API key for authentication.
- Only compatible with S3-compatible storage systems for media file accessibility.
- **Note**: This stores the media files in S3 prior to enriching them, as Whisper requires public URLs to access the media files.
- Handles multiple jobs and retries for failed or incomplete processing.
"""
}
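The `timeout` config above ("How many seconds to wait at most for a successful job completion") and the note about handling multiple jobs reduce to a poll-until-deadline loop. A minimal sketch of that pattern follows; `check_status` is a hypothetical callable standing in for the real Whisper API status request, and the status fields are assumptions, not the module's actual code.

```python
# Generic poll-until-deadline pattern; `check_status` is a hypothetical stand-in
# for the Whisper API status call, and job ids come from an earlier submit step.
import time

def poll_jobs(job_ids: list[str], check_status, timeout: int = 90, interval: int = 5) -> dict:
    results = {job_id: None for job_id in job_ids}   # None = still pending
    deadline = time.time() + timeout
    while time.time() < deadline and any(v is None for v in results.values()):
        for job_id, outcome in results.items():
            if outcome is None:
                status = check_status(job_id)        # e.g. returns {"state": "completed", ...}
                if status.get("state") in ("completed", "failed"):
                    results[job_id] = status
        time.sleep(interval)
    return results  # jobs still None after the deadline are reported as incomplete
```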
@@ -3,9 +3,8 @@ import requests, time
from loguru import logger

from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.s3_storage import S3Storage

from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module

class WhisperEnricher(Enricher):
    """

@@ -14,18 +13,25 @@ class WhisperEnricher(Enricher):
    Only works if an S3 compatible storage is used
    """

    def enrich(self, to_enrich: Metadata) -> None:
        if not self._get_s3_storage():
    def setup(self) -> None:
        self.stores = self.config['steps']['storages']
        self.s3 = get_module("s3_storage", self.config)
        if not "s3_storage" in self.stores:
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return


    def enrich(self, to_enrich: Metadata) -> None:

        url = to_enrich.get_url()
        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")

        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
                m.store(url=url, metadata=to_enrich)
                # TODO: this used to pass all storage items to store now
                # Now only passing S3, the rest will get added later in the usual order (?)
                m.store(url=url, metadata=to_enrich, storages=[self.s3])
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False

@@ -53,8 +59,8 @@ class WhisperEnricher(Enricher):
        to_enrich.set_content(f"\n[automatic video transcript]: {v}")

    def submit_job(self, media: Media):
        s3 = self._get_s3_storage()
        s3_url = s3.get_cdn_url(media)

        s3_url = self.s3.get_cdn_url(media)
        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
        payload = {
            "url": s3_url,

@@ -107,10 +113,3 @@ class WhisperEnricher(Enricher):
        logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
        return result
        return False

    def _get_s3_storage(self) -> S3Storage:
        try:
            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
        except:
            logger.warning("No S3Storage instance found in storages")
            return
@@ -2,7 +2,6 @@
# we need to explicitly expose the available imports here
from .misc import *
from .webdriver import Webdriver
from .url import UrlUtil
from .atlos import get_atlos_config_options

# handy utils from ytdlp
@@ -1,53 +0,0 @@
import json, gspread

from ..core import BaseModule


class Gsheets(BaseModule):
    name = "gsheets"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO: config should be responsible for conversions
        try: self.header = int(self.header)
        except: pass
        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
        assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."

    # TODO merge this into gsheets processors manifest
    @staticmethod
    def configs() -> dict:
        return {
            "sheet": {"default": None, "help": "name of the sheet to archive"},
            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
            "columns": {
                "default": {
                    'url': 'link',
                    'status': 'archive status',
                    'folder': 'destination folder',
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'pdq_hash': 'perceptual hashes',
                    'wacz': 'wacz',
                    'replaywebpage': 'replaywebpage',
                },
                "help": "names of columns in the google sheet (stringified JSON object)",
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            },
        }

    def open_sheet(self):
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        else: # self.sheet_id
            return self.gsheets_client.open_by_key(self.sheet_id)
@@ -1,7 +1,9 @@

import os, json, requests
import os
import json
import uuid
from datetime import datetime
from datetime import datetime, timezone
import requests
import hashlib
from loguru import logger


@@ -51,9 +53,52 @@ def update_nested_dict(dictionary, update_dict):
        else:
            dictionary[key] = value


def random_str(length: int = 32) -> str:
    assert length <= 32, "length must be less than 32 as UUID4 is used"
    return str(uuid.uuid4()).replace("-", "")[:length]


def json_loader(cli_val):
    return json.loads(cli_val)


def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
    hash = hash_algo()
    with open(filename, "rb") as f:
        while True:
            buf = f.read(chunksize)
            if not buf: break
            hash.update(buf)
    return hash.hexdigest()

def get_current_datetime_iso() -> str:
    return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat()


def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
    # parse a datetime string with option of passing a specific format
    try:
        return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
    except ValueError as e:
        logger.error(f"Unable to parse datestring {dt_str}: {e}")
        return None


def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
    # Consistent parsing of timestamps
    # If utc=True, the timezone is set to UTC,
    # if iso=True, the output is an iso string
    if not ts: return
    try:
        if isinstance(ts, str): ts = datetime.fromisoformat(ts)
        if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
        if utc: ts = ts.replace(tzinfo=timezone.utc)
        if iso: return ts.isoformat()
        return ts
    except Exception as e:
        logger.error(f"Unable to parse timestamp {ts}: {e}")
        return None

def get_current_timestamp() -> str:
    return get_timestamp(datetime.now())
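Quick usage examples for the helpers added above, assuming they are re-exported via `auto_archiver.utils` as the `from .misc import *` line in the package `__init__` suggests; the file name and example values are made up.

```python
# Illustrative calls to the new misc helpers; values and the file path are assumptions.
from auto_archiver.utils import (
    random_str, calculate_file_hash, get_datetime_from_str, get_timestamp, get_current_timestamp,
)

random_str(8)                                     # 8 characters taken from a UUID4
calculate_file_hash("example.mp4")                # sha256 hex digest, read in 16 MB chunks
get_datetime_from_str("2024-01-31T12:00:00")      # ISO-format parse
get_datetime_from_str("31/01/2024", "%d/%m/%Y")   # explicit format string
get_timestamp("2024-01-31T12:00:00")              # -> '2024-01-31T12:00:00+00:00' (UTC ISO string)
get_timestamp(1706702400, iso=False)              # -> timezone-aware datetime object
get_current_timestamp()                           # current time as a UTC ISO string
```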
@@ -1,79 +1,84 @@
import re
from urllib.parse import urlparse, urlunparse

class UrlUtil:
    telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
    is_istagram = re.compile(r"https:\/\/www\.instagram\.com")

    @staticmethod
    def clean(url: str) -> str: return url
AUTHWALL_URLS = [
    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
]

    @staticmethod
    def is_auth_wall(url: str) -> bool:
        """
        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
        """
        if UrlUtil.telegram_private.match(url): return True
        if UrlUtil.is_istagram.match(url): return True
def domain_for_url(url: str) -> str:
    """
    SECURITY: parse the domain using urllib to avoid any potential security issues
    """
    return urlparse(url).netloc

        return False
def clean(url: str) -> str:
    return url

    @staticmethod
    def remove_get_parameters(url: str) -> str:
        # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
        # useful for mimetypes to work
        parsed_url = urlparse(url)
        new_url = urlunparse(parsed_url._replace(query=''))
        return new_url
def is_auth_wall(url: str) -> bool:
    """
    checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
    """
    for regex in AUTHWALL_URLS:
        if regex.match(url):
            return True

    @staticmethod
    def is_relevant_url(url: str) -> bool:
        """
        Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
        """
        clean_url = UrlUtil.remove_get_parameters(url)
    return False

        # favicons
        if "favicon" in url: return False
        # ignore icons
        if clean_url.endswith(".ico"): return False
        # ignore SVGs
        if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
def remove_get_parameters(url: str) -> str:
    # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
    # useful for mimetypes to work
    parsed_url = urlparse(url)
    new_url = urlunparse(parsed_url._replace(query=''))
    return new_url

        # twitter profile pictures
        if "twimg.com/profile_images" in url: return False
        if "twimg.com" in url and "/default_profile_images" in url: return False
def is_relevant_url(url: str) -> bool:
    """
    Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
    """
    clean_url = remove_get_parameters(url)

        # instagram profile pictures
        if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
        # instagram recurring images
        if "https://static.cdninstagram.com/rsrc.php/" in url: return False
    # favicons
    if "favicon" in url: return False
    # ignore icons
    if clean_url.endswith(".ico"): return False
    # ignore SVGs
    if remove_get_parameters(url).endswith(".svg"): return False

        # telegram
        if "https://telegram.org/img/emoji/" in url: return False
    # twitter profile pictures
    if "twimg.com/profile_images" in url: return False
    if "twimg.com" in url and "/default_profile_images" in url: return False

        # youtube
        if "https://www.youtube.com/s/gaming/emoji/" in url: return False
        if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
        if "https://www.youtube.com/s/search/audio/" in url: return False
    # instagram profile pictures
    if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
    # instagram recurring images
    if "https://static.cdninstagram.com/rsrc.php/" in url: return False

        # ok
        if " https://ok.ru/res/i/" in url: return False
    # telegram
    if "https://telegram.org/img/emoji/" in url: return False

        # vk
        if "https://vk.com/emoji/" in url: return False
        if "vk.com/images/" in url: return False
        if "vk.com/images/reaction/" in url: return False
    # youtube
    if "https://www.youtube.com/s/gaming/emoji/" in url: return False
    if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
    if "https://www.youtube.com/s/search/audio/" in url: return False

        # wikipedia
        if "wikipedia.org/static" in url: return False
    # ok
    if " https://ok.ru/res/i/" in url: return False

        return True
    # vk
    if "https://vk.com/emoji/" in url: return False
    if "vk.com/images/" in url: return False
    if "vk.com/images/reaction/" in url: return False

    @staticmethod
    def twitter_best_quality_url(url: str) -> str:
        """
        some twitter image URLs point to a less-than best quality
        this returns the URL pointing to the highest (original) quality
        """
        return re.sub(r"name=(\w+)", "name=orig", url, 1)
    # wikipedia
    if "wikipedia.org/static" in url: return False

    return True

def twitter_best_quality_url(url: str) -> str:
    """
    some twitter image URLs point to a less-than best quality
    this returns the URL pointing to the highest (original) quality
    """
    return re.sub(r"name=(\w+)", "name=orig", url, 1)
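The refactor above replaces the `UrlUtil` class with plain module-level functions, so call sites switch from `UrlUtil.<fn>(...)` to importing the module (as the `from auto_archiver.utils import url as UrlUtil` change shows). A small illustrative sketch of the new call style:

```python
# Illustrative calls to the refactored module-level helpers.
from auto_archiver.utils import url as urlutil

urlutil.is_auth_wall("https://www.instagram.com/p/abc/")          # True: matches AUTHWALL_URLS
urlutil.remove_get_parameters("http://example.com/file.mp4?t=1")  # 'http://example.com/file.mp4'
urlutil.is_relevant_url("https://example.com/favicon.ico")        # False: icons are filtered out
urlutil.twitter_best_quality_url(
    "https://pbs.twimg.com/media/abc?format=jpg&name=small")      # rewrites name=small to name=orig
```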
@@ -9,12 +9,79 @@ from loguru import logger
from selenium.webdriver.common.by import By
import time

#import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar

class CookieSettingDriver(webdriver.Firefox):

    facebook_accept_cookies: bool
    cookies: str
    cookiejar: MozillaCookieJar

    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
        super(CookieSettingDriver, self).__init__(*args, **kwargs)
        self.cookies = cookies
        self.cookiejar = cookiejar
        self.facebook_accept_cookies = facebook_accept_cookies

    def get(self, url: str):
        if self.cookies or self.cookiejar:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
            super(CookieSettingDriver, self).get(robots_url)

            if self.cookies:
                # an explicit cookie is set for this site, use that first
                for cookie in self.cookies.split(";"):
                    for name, value in cookie.split("="):
                        self.driver.add_cookie({'name': name, 'value': value})
            elif self.cookiejar:
                domain = urlparse(url).netloc.lstrip("www.")
                for cookie in self.cookiejar:
                    if domain in cookie.domain:
                        try:
                            self.add_cookie({
                                'name': cookie.name,
                                'value': cookie.value,
                                'path': cookie.path,
                                'domain': cookie.domain,
                                'secure': bool(cookie.secure),
                                'expiry': cookie.expires
                            })
                        except Exception as e:
                            logger.warning(f"Failed to add cookie to webdriver: {e}")

        if self.facebook_accept_cookies:
            try:
                logger.debug(f'Trying fb click accept cookie popup.')
                super(CookieSettingDriver, self).get("http://www.facebook.com")
                essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
                essential_only.click()
                logger.debug(f'fb click worked')
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except Exception as e:
                logger.warning(f'Failed on fb accept cookies.', e)
        # now get the actual URL
        super(CookieSettingDriver, self).get(url)
        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
            close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
            if close_button:
                close_button.click()



class Webdriver:
    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
    def __init__(self, width: int, height: int, timeout_seconds: int,
                 facebook_accept_cookies: bool = False, http_proxy: str = "",
                 print_options: dict = {}, auth: dict = {}) -> webdriver:
        self.width = width
        self.height = height
        self.timeout_seconds = timeout_seconds
        self.auth = auth
        self.facebook_accept_cookies = facebook_accept_cookies
        self.http_proxy = http_proxy
        # create and set print options

@@ -23,32 +90,26 @@ class Webdriver:
            setattr(self.print_options, k, v)

    def __enter__(self) -> webdriver:

        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument(f'--proxy-server={self.http_proxy}')
        options.set_preference('network.protocol-handler.external.tg', False)
        # if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
        if self.facebook_accept_cookies:
            options.add_argument('--lang=en')

        try:
            self.driver = webdriver.Firefox(options=options)
            self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
                                              facebook_accept_cookies=self.facebook_accept_cookies, options=options)
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
            self.driver.print_options = self.print_options
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

        if self.facebook_accept_cookies:
            try:
                logger.debug(f'Trying fb click accept cookie popup.')
                self.driver.get("http://www.facebook.com")
                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                foo.click()
                logger.debug(f'fb click worked')
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except:
                logger.warning(f'Failed on fb accept cookies.')

        return self.driver


    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver.close()
        self.driver.quit()
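Based only on the constructor signature and the `auth.get('cookies')` / `auth.get('cookies_jar')` lookups shown above, a caller of the updated context manager might look like the hedged sketch below; the argument values and the cookie file path are made up.

```python
# Hypothetical caller of the updated Webdriver context manager; values are illustrative.
from http.cookiejar import MozillaCookieJar
from auto_archiver.utils import Webdriver

cookiejar = MozillaCookieJar("secrets/cookies.txt")   # assumed path to a Netscape-format cookie file
cookiejar.load()

with Webdriver(width=1280, height=2300, timeout_seconds=120,
               facebook_accept_cookies=False, http_proxy="",
               print_options={}, auth={"cookies": None, "cookies_jar": cookiejar}) as driver:
    driver.get("https://example.com/post/123")        # cookies are applied before the real GET
    driver.save_screenshot("screenshot.png")          # standard selenium call on the Firefox driver
```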
@@ -1,6 +0,0 @@
import tempfile

from auto_archiver.core.context import ArchivingContext

ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())
@@ -1,7 +1,9 @@
"""
pytest conftest file, for shared fixtures and configuration
"""

import os
import pickle
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest

@@ -23,13 +25,15 @@ def setup_module(request):
        # if the class does not have a .name, use the name of the parent folder
        module_name = module_name.__module__.rsplit(".",2)[-2]

    m = get_module(module_name).load()
    m.name = module_name
    m.setup({module_name : config})
    m = get_module(module_name, {module_name: config})

    # add the tmp_dir to the module
    tmp_dir = TemporaryDirectory()
    m.tmp_dir = tmp_dir.name

    def cleanup():
        _LAZY_LOADED_MODULES.pop(module_name)
        tmp_dir.cleanup()
    request.addfinalizer(cleanup)

    return m

@@ -110,4 +114,18 @@ def pytest_runtest_setup(item):
        test_name = _test_failed_incremental[cls_name].get((), None)
        # if name found, test has failed for the combination of class name & test name
        if test_name is not None:
            pytest.xfail(f"previous test failed ({test_name})")
            pytest.xfail(f"previous test failed ({test_name})")



@pytest.fixture()
def unpickle():
    """
    Returns a helper function that unpickles a file
    ** gets the file from the test_files directory: tests/data/test_files **
    """
    def _unpickle(path):
        test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
        with open(os.path.join(test_data_dir, path), "rb") as f:
            return pickle.load(f)
    return _unpickle
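For context, a test consuming the new `unpickle` fixture might look like this; the test name and pickle filename are hypothetical, and the file would have to exist under `tests/data/test_files`.

```python
# Hypothetical test using the `unpickle` fixture defined above.
def test_loads_pickled_metadata(unpickle):
    metadata = unpickle("example_metadata.pickle")  # assumed file under tests/data/test_files
    assert metadata is not None
```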
@@ -0,0 +1,2 @@
https://example.com/1/,data 1
https://example.com/2/,data 2