From 753039240f1f677d2f7756358c42b413b3046d87 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 21 Jan 2023 19:01:02 +0000 Subject: [PATCH] pyproject --- .gitignore | 4 +- Pipfile | 2 + Pipfile.lock | 189 ++++++----- pyproject.toml | 4 + setup.cfg | 49 +++ src/__init__.py | 1 + src/auto_archiver/__init__.py | 7 + src/auto_archiver/__main__.py | 12 + src/{ => auto_archiver}/archivers/__init__.py | 2 +- src/{ => auto_archiver}/archivers/archiver.py | 4 +- .../archivers/base_archiver.py | 2 +- .../archivers/instagram_archiverv2.py | 7 +- .../archivers/telegram_archiverv2.py | 6 +- .../archivers/telethon_archiverv2.py | 7 +- .../archivers/tiktok_archiverv2.py | 6 +- .../archivers/twitter_api_archiverv2.py | 6 +- .../archivers/twitter_archiverv2.py | 6 +- .../archivers/vk_archiverv2.py | 8 +- .../archivers/youtubedl_archiverv2.py | 6 +- src/{ => auto_archiver}/auto_archive.py | 0 src/{ => auto_archiver}/auto_auto_archive.py | 0 src/{ => auto_archiver}/cli.py | 0 src/auto_archiver/core/__init__.py | 7 + src/{ => auto_archiver/core}/media.py | 4 +- src/{ => auto_archiver/core}/metadata.py | 6 +- src/{ => auto_archiver/core}/orchestrator.py | 91 ++---- src/{steps => auto_archiver/core}/step.py | 2 +- .../core}/v2config.py | 67 ++-- src/auto_archiver/databases/__init__.py | 3 + src/auto_archiver/databases/console_db.py | 32 ++ src/{ => auto_archiver}/databases/database.py | 4 +- .../databases/gsheet_db.py | 12 +- src/{ => auto_archiver}/enrichers/__init__.py | 0 src/{ => auto_archiver}/enrichers/enricher.py | 4 +- .../enrichers/hash_enricher.py | 7 +- .../enrichers/screenshot_enricher.py | 8 +- .../enrichers/thumbnail_enricher.py | 6 +- .../enrichers/wacz_enricher.py | 8 +- .../enrichers/wayback_enricher.py | 6 +- src/{ => auto_archiver}/feeders/__init__.py | 0 src/{ => auto_archiver}/feeders/feeder.py | 4 +- .../feeders/gsheet_feeder.py | 10 +- .../formatters/__init__.py | 0 .../formatters/formatter.py | 4 +- .../formatters/html_formatter.py | 8 +- .../formatters/templates/html_template.html | 0 .../formatters/templates/macros.html | 0 src/auto_archiver/storages/__init__.py | 9 + .../storages/base_storage.py | 0 .../storages/gd_storage.py | 0 src/{ => auto_archiver}/storages/local.py | 7 +- src/{ => auto_archiver}/storages/s3.py | 6 +- .../storages/s3_storage.py | 0 src/{ => auto_archiver}/storages/storage.py | 5 +- src/{ => auto_archiver}/utils/__init__.py | 3 +- src/{steps => auto_archiver/utils}/gsheet.py | 2 +- src/{ => auto_archiver}/utils/gworksheet.py | 0 src/{ => auto_archiver}/utils/misc.py | 0 src/{ => auto_archiver}/utils/util.py | 3 +- src/{ => auto_archiver}/utils/webdriver.py | 0 src/configs/__init__.py | 7 - src/configs/browsertrix_config.py | 7 - src/configs/config.py | 309 ------------------ src/configs/instagram_config.py | 9 - src/configs/selenium_config.py | 8 - src/configs/telethon_config.py | 10 - src/configs/twitter_api_config.py | 11 - src/configs/vk_config.py | 9 - src/configs/wayback_config.py | 8 - src/databases/__init__.py | 2 - src/storages/__init__.py | 9 - src/storages/local_storage.py | 36 -- src/v2.py | 12 - 73 files changed, 404 insertions(+), 689 deletions(-) create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 src/auto_archiver/__init__.py create mode 100644 src/auto_archiver/__main__.py rename src/{ => auto_archiver}/archivers/__init__.py (89%) rename src/{ => auto_archiver}/archivers/archiver.py (96%) rename src/{ => auto_archiver}/archivers/base_archiver.py (99%) rename src/{ => auto_archiver}/archivers/instagram_archiverv2.py (98%) rename src/{ => auto_archiver}/archivers/telegram_archiverv2.py (96%) rename src/{ => auto_archiver}/archivers/telethon_archiverv2.py (99%) rename src/{ => auto_archiver}/archivers/tiktok_archiverv2.py (95%) rename src/{ => auto_archiver}/archivers/twitter_api_archiverv2.py (98%) rename src/{ => auto_archiver}/archivers/twitter_archiverv2.py (98%) rename src/{ => auto_archiver}/archivers/vk_archiverv2.py (93%) rename src/{ => auto_archiver}/archivers/youtubedl_archiverv2.py (96%) rename src/{ => auto_archiver}/auto_archive.py (100%) rename src/{ => auto_archiver}/auto_auto_archive.py (100%) rename src/{ => auto_archiver}/cli.py (100%) create mode 100644 src/auto_archiver/core/__init__.py rename src/{ => auto_archiver/core}/media.py (93%) rename src/{ => auto_archiver/core}/metadata.py (98%) rename src/{ => auto_archiver/core}/orchestrator.py (72%) rename src/{steps => auto_archiver/core}/step.py (97%) rename src/{configs => auto_archiver/core}/v2config.py (66%) create mode 100644 src/auto_archiver/databases/__init__.py create mode 100644 src/auto_archiver/databases/console_db.py rename src/{ => auto_archiver}/databases/database.py (95%) rename src/{ => auto_archiver}/databases/gsheet_db.py (96%) rename src/{ => auto_archiver}/enrichers/__init__.py (100%) rename src/{ => auto_archiver}/enrichers/enricher.py (89%) rename src/{ => auto_archiver}/enrichers/hash_enricher.py (96%) rename src/{ => auto_archiver}/enrichers/screenshot_enricher.py (94%) rename src/{ => auto_archiver}/enrichers/thumbnail_enricher.py (96%) rename src/{ => auto_archiver}/enrichers/wacz_enricher.py (96%) rename src/{ => auto_archiver}/enrichers/wayback_enricher.py (98%) rename src/{ => auto_archiver}/feeders/__init__.py (100%) rename src/{ => auto_archiver}/feeders/feeder.py (80%) rename src/{ => auto_archiver}/feeders/gsheet_feeder.py (96%) rename src/{ => auto_archiver}/formatters/__init__.py (100%) rename src/{ => auto_archiver}/formatters/formatter.py (80%) rename src/{ => auto_archiver}/formatters/html_formatter.py (96%) rename src/{ => auto_archiver}/formatters/templates/html_template.html (100%) rename src/{ => auto_archiver}/formatters/templates/macros.html (100%) create mode 100644 src/auto_archiver/storages/__init__.py rename src/{ => auto_archiver}/storages/base_storage.py (100%) rename src/{ => auto_archiver}/storages/gd_storage.py (100%) rename src/{ => auto_archiver}/storages/local.py (94%) rename src/{ => auto_archiver}/storages/s3.py (97%) rename src/{ => auto_archiver}/storages/s3_storage.py (100%) rename src/{ => auto_archiver}/storages/storage.py (95%) rename src/{ => auto_archiver}/utils/__init__.py (69%) rename src/{steps => auto_archiver/utils}/gsheet.py (98%) rename src/{ => auto_archiver}/utils/gworksheet.py (100%) rename src/{ => auto_archiver}/utils/misc.py (100%) rename src/{ => auto_archiver}/utils/util.py (88%) rename src/{ => auto_archiver}/utils/webdriver.py (100%) delete mode 100644 src/configs/__init__.py delete mode 100644 src/configs/browsertrix_config.py delete mode 100644 src/configs/config.py delete mode 100644 src/configs/instagram_config.py delete mode 100644 src/configs/selenium_config.py delete mode 100644 src/configs/telethon_config.py delete mode 100644 src/configs/twitter_api_config.py delete mode 100644 src/configs/vk_config.py delete mode 100644 src/configs/wayback_config.py delete mode 100644 src/databases/__init__.py delete mode 100644 src/storages/__init__.py delete mode 100644 src/storages/local_storage.py delete mode 100644 src/v2.py diff --git a/.gitignore b/.gitignore index 88ccd0e..2c55563 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,6 @@ browsertrix/* browsertrix-tmp/* instaloader/* instaloader.session -orchestration.yaml \ No newline at end of file +orchestration.yaml +auto_archiver.egg-info* +logs* \ No newline at end of file diff --git a/Pipfile b/Pipfile index d79388d..9a4e54f 100644 --- a/Pipfile +++ b/Pipfile @@ -29,9 +29,11 @@ instaloader = "*" tqdm = "*" jinja2 = "*" cryptography = "==38.0.4" +dataclasses-json = "*" [requires] python_version = "3.9" [dev-packages] autopep8 = "*" +setuptools-pipfile = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 83e2607..8c29e0b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "bcc36e9ecdf6d383a1010629484eec271699ac23b40be045d9a9669b4c9fac8c" + "sha256": "e2f5d017d9bc9eef90cced189b6e3017d740c35d204962479417109a4deeb7f4" }, "pipfile-spec": 6, "requires": { @@ -57,19 +57,19 @@ }, "boto3": { "hashes": [ - "sha256:96055651f7be882175aa334ad46528e1ad79fb8ca33fa9c3998cc1d985b34eab", - "sha256:e24d65c31780c208768ebcd152d8a0181591c9c8e7d971e23f318d7f41910ba1" + "sha256:4e876ba5d64928cde0c416dd844f04f22d6b73d14002bbc3ca55591f80f49927", + "sha256:c729bb0af76e85a2776b6bd3da8d9fa0f4b91b425eab51612aa53956f644ee23" ], "index": "pypi", - "version": "==1.26.46" + "version": "==1.26.54" }, "botocore": { "hashes": [ - "sha256:78bf25933e35eb6354a9e80fe156f86dce4d346a92afe364dfce25c17ab0639f", - "sha256:dbac2fde265f13beb9191ec3ff63b90b515e9ed63875edc3afbd72c5f585e48b" + "sha256:ca3ef7588daa664fe196d3234718db5f6b5dab961507500b4bb921e31133eea1", + "sha256:f2fe17ed6b8e163769a715f81cb6ce3d4628d172918de535256bdf34d29b704f" ], "markers": "python_version >= '3.7'", - "version": "==1.29.46" + "version": "==1.29.54" }, "brotli": { "hashes": [ @@ -269,17 +269,10 @@ }, "cloudscraper": { "hashes": [ - "sha256:2776c70f3661c028e59fd306ac2b104882c9b3cb3f798086251e00fc2d72c3a2", - "sha256:3b9753724616ac4d811e7922ddc9dba9b4419749ebaa35b0ba503d442522df2e" + "sha256:401409859697edae9384a7623b450cc97ab14dd0b2c8cdcac62edc2d50b31741", + "sha256:4d02aceffa90abd4dabc75b79bafa31636309baa7c0f2ee665e2d345aadb8863" ], - "version": "==1.2.67" - }, - "commonmark": { - "hashes": [ - "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60", - "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9" - ], - "version": "==0.9.1" + "version": "==1.2.68" }, "cryptography": { "hashes": [ @@ -318,16 +311,16 @@ "sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd", "sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90" ], - "markers": "python_version >= '3.6'", + "index": "pypi", "version": "==0.5.7" }, "dateparser": { "hashes": [ - "sha256:107f3cc87a60770e10d111349adc1504224a6b60753a47a64b0ec842ab85b5a9", - "sha256:ceb159f1b4a9df54ed6209e91298097deafde476037f8611b4cb2b1cb8b31c58" + "sha256:c47b6e4b8c4b2b2a21690111b6571b6991295ba327ec6503753abeebf5e80696", + "sha256:e703db1815270c020552f4b3e3a981937b48b2cbcfcef5347071b74788dd9214" ], "index": "pypi", - "version": "==1.1.5" + "version": "==1.1.6" }, "exceptiongroup": { "hashes": [ @@ -363,10 +356,10 @@ }, "future": { "hashes": [ - "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" + "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307" ], "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.18.2" + "version": "==0.18.3" }, "google-api-core": { "hashes": [ @@ -378,11 +371,11 @@ }, "google-api-python-client": { "hashes": [ - "sha256:9412ad3445518fa9d24d02c673a70b07c9d124990f44763cdf4f5304ca5b4d08", - "sha256:a4ea351db2bb2a9b1a7e96d8fa8de0fcbc31d9e237b724f4a07b243c2d63e9a4" + "sha256:7e860e3ec27b504fb797fa23c07c012a874dd736491fddbe50a20d3bdde8ace6", + "sha256:bafce2a02b06ee501df039eba5874afc7d28c9cf5ef92253327776448706556d" ], "index": "pypi", - "version": "==2.71.0" + "version": "==2.73.0" }, "google-auth": { "hashes": [ @@ -570,51 +563,69 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==4.9.2" }, - "markupsafe": { + "markdown-it-py": { "hashes": [ - "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003", - "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88", - "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5", - "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7", - "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a", - "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603", - "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1", - "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135", - "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247", - "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6", - "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601", - "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77", - "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02", - "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e", - "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63", - "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f", - "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980", - "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b", - "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812", - "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff", - "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96", - "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1", - "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925", - "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a", - "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6", - "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e", - "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f", - "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4", - "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f", - "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3", - "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c", - "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a", - "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417", - "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a", - "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a", - "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37", - "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452", - "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933", - "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", - "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" + "sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27", + "sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da" ], "markers": "python_version >= '3.7'", - "version": "==2.1.1" + "version": "==2.1.0" + }, + "markupsafe": { + "hashes": [ + "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed", + "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc", + "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2", + "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460", + "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7", + "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0", + "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1", + "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa", + "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03", + "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323", + "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65", + "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013", + "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036", + "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f", + "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4", + "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419", + "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2", + "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619", + "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a", + "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a", + "sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd", + "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7", + "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666", + "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65", + "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859", + "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625", + "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff", + "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156", + "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd", + "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba", + "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f", + "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1", + "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094", + "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a", + "sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513", + "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed", + "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d", + "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3", + "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147", + "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c", + "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603", + "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601", + "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a", + "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1", + "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d", + "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3", + "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54", + "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2", + "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6", + "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.2" }, "marshmallow": { "hashes": [ @@ -631,6 +642,14 @@ ], "version": "==1.5.1" }, + "mdurl": { + "hashes": [ + "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba" + ], + "markers": "python_version >= '3.7'", + "version": "==0.1.2" + }, "mutagen": { "hashes": [ "sha256:6e5f8ba84836b99fe60be5fb27f84be4ad919bbb6b49caa6ae81e70584b55e58", @@ -837,10 +856,10 @@ }, "pytz": { "hashes": [ - "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a", - "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd" + "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0", + "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a" ], - "version": "==2022.7" + "version": "==2022.7.1" }, "pytz-deprecation-shim": { "hashes": [ @@ -1016,11 +1035,11 @@ }, "rich": { "hashes": [ - "sha256:25f83363f636995627a99f6e4abc52ed0970ebbd544960cc63cbb43aaac3d6f0", - "sha256:41fe1d05f433b0f4724cda8345219213d2bfa472ef56b2f64f415b5b94d51b04" + "sha256:7c963f0d03819221e9ac561e1bc866e3f95a02248c1234daa48954e6d381c003", + "sha256:f1a00cdd3eebf999a15d85ec498bfe0b1a77efe9b34f645768a54132ef444ac5" ], "markers": "python_version >= '3.7'", - "version": "==13.0.1" + "version": "==13.2.0" }, "rsa": { "hashes": [ @@ -1064,11 +1083,11 @@ }, "snscrape": { "hashes": [ - "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", - "sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2" + "sha256:106bd375d47b683f88e96acbf425747358fd851f5282a91a0fa0c6784f29f2e4", + "sha256:194078946ff53c8b2a79db7695dde351819b7849009aa137e26cda924d3ae702" ], "index": "pypi", - "version": "==0.4.3.20220106" + "version": "==0.5.0.20230113" }, "sortedcontainers": { "hashes": [ @@ -1310,6 +1329,22 @@ "markers": "python_version >= '3.6'", "version": "==2.10.0" }, + "setuptools-pipfile": { + "hashes": [ + "sha256:54cb6bf6a662fe74951425d509772a5302d1cf723d9a3654d19c2468d3d80b6b", + "sha256:f6049892af8e8233a438cf00fb4477fe81de3ea0e8e90c1241d196cb40f703b5" + ], + "index": "pypi", + "version": "==0.7.0" + }, + "toml": { + "hashes": [ + "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", + "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.2" + }, "tomli": { "hashes": [ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0b46f6f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[build-system] +requires = ["setuptools", "wheel", "setuptools-pipfile"] +build-backend = "setuptools.build_meta" +[tool.setuptools-pipfile] \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..f7dce11 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,49 @@ +[metadata] +name = auto_archiver +version = 2.0.0 +author = Bellingcat +author_email = tech@bellingcat.com +description = Easily archive online media content +long_description = file: README.md, LICENSE +keywords = archive, oosi, osint, scraping +license = MIT +classifiers = + Intended Audience :: Developers, + Intended Audience :: Science/Research, + License :: OSI Approved :: MIT License, + Programming Language :: Python :: 3, + +[options] +setup_requires = + setuptools-pipfile +zip_safe = False +include_package_data = True +package_dir= + =src +packages=find: +find_packages=true +python_requires = >=3.8 + +# [options.package_data] +# * = *.txt, *.rst +# hello = *.msg + +[options.entry_points] +console_scripts = + auto-archiver = auto_archiver.__main__:main + +# [options.extras_require] +# pdf = ReportLab>=1.2; RXP +# rest = docutils>=0.3; pack ==1.1, ==1.3 + +[options.packages.find] +where=src +# include=auto_archiver* +# exclude = +# examples* +# .eggs* +# build* +# secrets* +# tmp* +# docs* +# src.tests* \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py index e69de29..c06311a 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1 @@ +# from .auto_archiver import * \ No newline at end of file diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py new file mode 100644 index 0000000..c02d330 --- /dev/null +++ b/src/auto_archiver/__init__.py @@ -0,0 +1,7 @@ +from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core + +# need to manually specify due to cyclical deps +from .core.orchestrator import ArchivingOrchestrator +from .core.v2config import ConfigV2 +# making accessible directly +from .core.metadata import Metadata \ No newline at end of file diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py new file mode 100644 index 0000000..812e4e2 --- /dev/null +++ b/src/auto_archiver/__main__.py @@ -0,0 +1,12 @@ +from . import ConfigV2 +from . import ArchivingOrchestrator + +def main(): + config = ConfigV2() + config.parse() + orchestrator = ArchivingOrchestrator(config) + orchestrator.feed() + + +if __name__ == "__main__": + main() diff --git a/src/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py similarity index 89% rename from src/archivers/__init__.py rename to src/auto_archiver/archivers/__init__.py index 22e142f..595ea8c 100644 --- a/src/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -1,5 +1,5 @@ # we need to explicitly expose the available imports here -from .base_archiver import Archiver, ArchiveResult +# from .base_archiver import Archiver, ArchiveResult # from .telegram_archiver import TelegramArchiver # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver diff --git a/src/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py similarity index 96% rename from src/archivers/archiver.py rename to src/auto_archiver/archivers/archiver.py index 7682e11..d09044e 100644 --- a/src/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -2,9 +2,9 @@ from __future__ import annotations from abc import abstractmethod from dataclasses import dataclass import os -from metadata import Metadata -from steps.step import Step import mimetypes, requests +from ..core import Metadata +from ..core import Step @dataclass diff --git a/src/archivers/base_archiver.py b/src/auto_archiver/archivers/base_archiver.py similarity index 99% rename from src/archivers/base_archiver.py rename to src/auto_archiver/archivers/base_archiver.py index 75395b5..103b9ff 100644 --- a/src/archivers/base_archiver.py +++ b/src/auto_archiver/archivers/base_archiver.py @@ -11,7 +11,7 @@ from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from slugify import slugify -from configs import Config +from ..configs import Config from storages import Storage from utils import mkdir_if_not_exists diff --git a/src/archivers/instagram_archiverv2.py b/src/auto_archiver/archivers/instagram_archiverv2.py similarity index 98% rename from src/archivers/instagram_archiverv2.py rename to src/auto_archiver/archivers/instagram_archiverv2.py index 2ca2e80..126622a 100644 --- a/src/archivers/instagram_archiverv2.py +++ b/src/auto_archiver/archivers/instagram_archiverv2.py @@ -2,10 +2,9 @@ import re, os, shutil, html, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from metadata import Metadata -from media import Media -from .archiver import Archiverv2 - +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class InstagramArchiver(Archiverv2): """ diff --git a/src/archivers/telegram_archiverv2.py b/src/auto_archiver/archivers/telegram_archiverv2.py similarity index 96% rename from src/archivers/telegram_archiverv2.py rename to src/auto_archiver/archivers/telegram_archiverv2.py index 674fa26..2f4bf23 100644 --- a/src/archivers/telegram_archiverv2.py +++ b/src/auto_archiver/archivers/telegram_archiverv2.py @@ -4,9 +4,9 @@ import html from bs4 import BeautifulSoup from loguru import logger -from metadata import Metadata -from media import Media -from .archiver import Archiverv2 +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class TelegramArchiver(Archiverv2): diff --git a/src/archivers/telethon_archiverv2.py b/src/auto_archiver/archivers/telethon_archiverv2.py similarity index 99% rename from src/archivers/telethon_archiverv2.py rename to src/auto_archiver/archivers/telethon_archiverv2.py index 3f698e1..57d27e1 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/auto_archiver/archivers/telethon_archiverv2.py @@ -1,5 +1,4 @@ -from archivers import Archiverv2 -from metadata import Metadata + from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError from telethon.tl.types import PeerUser, PeerChat, PeerChannel @@ -9,7 +8,9 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from media import Media +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class TelethonArchiver(Archiverv2): diff --git a/src/archivers/tiktok_archiverv2.py b/src/auto_archiver/archivers/tiktok_archiverv2.py similarity index 95% rename from src/archivers/tiktok_archiverv2.py rename to src/auto_archiver/archivers/tiktok_archiverv2.py index 85d3083..ea7f670 100644 --- a/src/archivers/tiktok_archiverv2.py +++ b/src/auto_archiver/archivers/tiktok_archiverv2.py @@ -5,9 +5,9 @@ import uuid import tiktok_downloader from loguru import logger -from metadata import Metadata -from media import Media -from .archiver import Archiverv2 +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class TiktokArchiver(Archiverv2): diff --git a/src/archivers/twitter_api_archiverv2.py b/src/auto_archiver/archivers/twitter_api_archiverv2.py similarity index 98% rename from src/archivers/twitter_api_archiverv2.py rename to src/auto_archiver/archivers/twitter_api_archiverv2.py index 1f43935..007193c 100644 --- a/src/archivers/twitter_api_archiverv2.py +++ b/src/auto_archiver/archivers/twitter_api_archiverv2.py @@ -7,10 +7,10 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from metadata import Metadata -from media import Media +from . import Archiverv2 from .twitter_archiverv2 import TwitterArchiver -from .archiver import Archiverv2 +from ..core import Metadata +from ..core import Media class TwitterApiArchiver(TwitterArchiver, Archiverv2): diff --git a/src/archivers/twitter_archiverv2.py b/src/auto_archiver/archivers/twitter_archiverv2.py similarity index 98% rename from src/archivers/twitter_archiverv2.py rename to src/auto_archiver/archivers/twitter_archiverv2.py index d537fe4..58942ab 100644 --- a/src/archivers/twitter_archiverv2.py +++ b/src/auto_archiver/archivers/twitter_archiverv2.py @@ -4,12 +4,12 @@ import json import os from datetime import datetime from loguru import logger -from metadata import Metadata from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo -from archivers import Archiverv2 -from media import Media from slugify import slugify +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class TwitterArchiver(Archiverv2): """ diff --git a/src/archivers/vk_archiverv2.py b/src/auto_archiver/archivers/vk_archiverv2.py similarity index 93% rename from src/archivers/vk_archiverv2.py rename to src/auto_archiver/archivers/vk_archiverv2.py index 32b6cec..4c2ad98 100644 --- a/src/archivers/vk_archiverv2.py +++ b/src/auto_archiver/archivers/vk_archiverv2.py @@ -1,10 +1,10 @@ from loguru import logger from vk_url_scraper import VkScraper -from utils.misc import dump_payload -from metadata import Metadata -from media import Media -from .archiver import Archiverv2 +from ..utils.misc import dump_payload +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class VkArchiver(Archiverv2): diff --git a/src/archivers/youtubedl_archiverv2.py b/src/auto_archiver/archivers/youtubedl_archiverv2.py similarity index 96% rename from src/archivers/youtubedl_archiverv2.py rename to src/auto_archiver/archivers/youtubedl_archiverv2.py index 6d26de6..0b5916c 100644 --- a/src/archivers/youtubedl_archiverv2.py +++ b/src/auto_archiver/archivers/youtubedl_archiverv2.py @@ -4,9 +4,9 @@ import os import yt_dlp from loguru import logger -from metadata import Metadata -from media import Media -from .archiver import Archiverv2 +from . import Archiverv2 +from ..core import Metadata +from ..core import Media class YoutubeDLArchiver(Archiverv2): diff --git a/src/auto_archive.py b/src/auto_archiver/auto_archive.py similarity index 100% rename from src/auto_archive.py rename to src/auto_archiver/auto_archive.py diff --git a/src/auto_auto_archive.py b/src/auto_archiver/auto_auto_archive.py similarity index 100% rename from src/auto_auto_archive.py rename to src/auto_archiver/auto_auto_archive.py diff --git a/src/cli.py b/src/auto_archiver/cli.py similarity index 100% rename from src/cli.py rename to src/auto_archiver/cli.py diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py new file mode 100644 index 0000000..a171478 --- /dev/null +++ b/src/auto_archiver/core/__init__.py @@ -0,0 +1,7 @@ +from .media import Media +from .metadata import Metadata +from .step import Step + +# cannot import ArchivingOrchestrator/Config to avoid circular dep +# from .orchestrator import ArchivingOrchestrator +# from .v2config import ConfigV2 \ No newline at end of file diff --git a/src/media.py b/src/auto_archiver/core/media.py similarity index 93% rename from src/media.py rename to src/auto_archiver/core/media.py index 779ee88..40cab1f 100644 --- a/src/media.py +++ b/src/auto_archiver/core/media.py @@ -3,9 +3,11 @@ from __future__ import annotations from ast import List from typing import Any from dataclasses import dataclass, field +from dataclasses_json import dataclass_json import mimetypes - +# annotation order matters +@dataclass_json @dataclass class Media: filename: str diff --git a/src/metadata.py b/src/auto_archiver/core/metadata.py similarity index 98% rename from src/metadata.py rename to src/auto_archiver/core/metadata.py index 2293eb3..bb3079c 100644 --- a/src/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -3,13 +3,15 @@ from __future__ import annotations from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field +from dataclasses_json import dataclass_json import datetime, mimetypes from urllib.parse import urlparse from loguru import logger from dateutil.parser import parse as parse_dt -from media import Media - +from .media import Media +# annotation order matters +@dataclass_json @dataclass class Metadata: status: str = "no archiver" diff --git a/src/orchestrator.py b/src/auto_archiver/core/orchestrator.py similarity index 72% rename from src/orchestrator.py rename to src/auto_archiver/core/orchestrator.py index fb28cfb..19a35aa 100644 --- a/src/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -3,14 +3,14 @@ from ast import List from typing import Union, Dict from dataclasses import dataclass -from archivers import Archiverv2 -from feeders import Feeder -from formatters import Formatter -from media import Media -from storages import StorageV2 -from enrichers import Enricher -from databases import Database -from metadata import Metadata +from ..archivers import Archiverv2 +from ..feeders import Feeder +from ..formatters import Formatter +from ..storages import StorageV2 +from ..enrichers import Enricher +from ..databases import Database +from .media import Media +from .metadata import Metadata import tempfile, time, traceback from loguru import logger @@ -56,17 +56,6 @@ Cisticola considerations: class ArchivingOrchestrator: def __init__(self, config) -> None: - # in config.py we should test that the archivers exist and log mismatches (blocking execution) - # identify each formatter, storage, database, etc - # self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) - - # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI - # where does that update/processing happen? in config.py - # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__ - # self.archivers = [ - # Archiver.init(a, config) - # for a in config.archivers - # ] self.feeder: Feeder = config.feeder self.formatter: Formatter = config.formatter self.enrichers = config.enrichers @@ -76,50 +65,32 @@ class ArchivingOrchestrator: for a in self.archivers: a.setup() - self.formatters = [] - # self.formatters = [ - # Formatter.init(f, config) - # for f in config.formatters - # ] - - # self.storages = [ - # Storage.init(s, config) - # for s in config.storages - # ] - - # self.databases = [ - # Database.init(f, config) - # for f in config.formatters - # ] - - # these rules are checked in config.py - # assert len(archivers) > 1, "there needs to be at least one Archiver" - - def feed(self) -> list(Metadata): + def feed(self) -> None: for item in self.feeder: - print("ARCHIVING", item) - try: - with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - item.set_tmp_dir(tmp_dir) - result = self.archive(item) - print(result) - except KeyboardInterrupt: - # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt on {item=}") - for d in self.databases: d.aborted(item) - exit() - except Exception as e: - logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') - for d in self.databases: d.failed(item) + self.feed_item(item) - print("holding on 5s") - time.sleep(5) + def feed_item(self, item:Metadata) -> Metadata: + print("ARCHIVING", item) + try: + with tempfile.TemporaryDirectory(dir="./") as tmp_dir: + item.set_tmp_dir(tmp_dir) + result = self.archive(item) + except KeyboardInterrupt: + # catches keyboard interruptions to do a clean exit + logger.warning(f"caught interrupt on {item=}") + for d in self.databases: d.aborted(item) + exit() + except Exception as e: + logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') + for d in self.databases: d.failed(item) - # how does this handle the parameters like folder which can be different for each archiver? - # the storage needs to know where to archive!! - # solution: feeders have context: extra metadata that they can read or ignore, - # all of it should have sensible defaults (eg: folder) - # default feeder is a list with 1 element + return result + + # how does this handle the parameters like folder which can be different for each archiver? + # the storage needs to know where to archive!! + # solution: feeders have context: extra metadata that they can read or ignore, + # all of it should have sensible defaults (eg: folder) + # default feeder is a list with 1 element def archive(self, result: Metadata) -> Union[Metadata, None]: url = result.get_url() diff --git a/src/steps/step.py b/src/auto_archiver/core/step.py similarity index 97% rename from src/steps/step.py rename to src/auto_archiver/core/step.py index e80437b..ae10869 100644 --- a/src/steps/step.py +++ b/src/auto_archiver/core/step.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass, field from inspect import ClassFoundException from typing import Type -from metadata import Metadata +from ..core import Metadata from abc import ABC # from collections.abc import Iterable diff --git a/src/configs/v2config.py b/src/auto_archiver/core/v2config.py similarity index 66% rename from src/configs/v2config.py rename to src/auto_archiver/core/v2config.py index 7b0820d..3db6618 100644 --- a/src/configs/v2config.py +++ b/src/auto_archiver/core/v2config.py @@ -3,15 +3,16 @@ import argparse, yaml from dataclasses import dataclass, field from typing import List -from archivers import Archiverv2 -from feeders import Feeder -from databases import Database -from formatters import Formatter -from storages import StorageV2 -from steps.step import Step -from enrichers import Enricher from collections import defaultdict +from ..archivers import Archiverv2 +from ..feeders import Feeder +from ..databases import Database +from ..formatters import Formatter +from ..storages import StorageV2 +from . import Step +from ..enrichers import Enricher + @dataclass class ConfigV2: @@ -37,17 +38,22 @@ class ConfigV2: self.defaults = {} self.cli_ops = {} self.config = {} - # TODO: make this work for nested props like gsheet_feeder.columns.url = "URL" - def parse(self): - # 1. parse CLI values - parser = argparse.ArgumentParser( - # prog = "auto-archiver", - description="Auto Archiver is a ...!", - epilog="Check the code at https://github.com/bellingcat/auto-archiver" - ) - parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') + def parse(self, use_cli=True, yaml_config_filename: str = None): + """ + if yaml_config_filename is provided, the --config argument is ignored, + useful for library usage when the config values are preloaded + """ + # 1. parse CLI values + if use_cli: + parser = argparse.ArgumentParser( + # prog = "auto-archiver", + description="Auto Archiver is a ...!", # TODO: update + epilog="Check the code at https://github.com/bellingcat/auto-archiver" + ) + + parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') for configurable in self.configurable_parents: child: Step @@ -57,28 +63,32 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - try: - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) - except argparse.ArgumentError: - # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver - pass + + if use_cli: + try: + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) + except argparse.ArgumentError: + # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver + pass self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] - args = parser.parse_args() + if use_cli: + args = parser.parse_args() + yaml_config_filename = yaml_config_filename or getattr(args, "config") + else: args = {} - # 2. read YAML config file - with open(args.config, "r", encoding="utf-8") as inf: - self.yaml_config = yaml.safe_load(inf) + # 2. read YAML config file (or use provided value) + self.yaml_config = self.read_yaml(yaml_config_filename) # print(f"{self.yaml_config.get('configurations', {})=}") # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default self.config = defaultdict(dict) for config_path, default in self.defaults.items(): child, config = tuple(config_path.split(".")) - val = getattr(args, config_path) + val = getattr(args, config_path, None) if val is not None and config_path in self.cli_ops: val = self.cli_ops[config_path](val, default) if val is None: @@ -108,5 +118,6 @@ class ConfigV2: print("storages", [e for e in self.storages]) print("formatter", self.formatter) - def validate(self): - pass + def read_yaml(self, yaml_filename: str) -> dict: + with open(yaml_filename, "r", encoding="utf-8") as inf: + return yaml.safe_load(inf) diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py new file mode 100644 index 0000000..3a3e907 --- /dev/null +++ b/src/auto_archiver/databases/__init__.py @@ -0,0 +1,3 @@ +from .database import Database +from .gsheet_db import GsheetsDb +from .console_db import ConsoleDb \ No newline at end of file diff --git a/src/auto_archiver/databases/console_db.py b/src/auto_archiver/databases/console_db.py new file mode 100644 index 0000000..a22bc8e --- /dev/null +++ b/src/auto_archiver/databases/console_db.py @@ -0,0 +1,32 @@ +from loguru import logger + +from . import Database +from ..core import Metadata + + +class ConsoleDb(Database): + """ + Outputs results to the console + """ + name = "console_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + + def failed(self, item: Metadata) -> None: + logger.error(f"FAILED {item}") + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + + def done(self, item: Metadata) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item}") \ No newline at end of file diff --git a/src/databases/database.py b/src/auto_archiver/databases/database.py similarity index 95% rename from src/databases/database.py rename to src/auto_archiver/databases/database.py index 94b2178..3f09c4a 100644 --- a/src/databases/database.py +++ b/src/auto_archiver/databases/database.py @@ -2,8 +2,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from metadata import Metadata -from steps.step import Step +from ..core import Metadata +from ..core import Step @dataclass diff --git a/src/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py similarity index 96% rename from src/databases/gsheet_db.py rename to src/auto_archiver/databases/gsheet_db.py index 0cf65ed..13660ca 100644 --- a/src/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -5,11 +5,11 @@ import gspread, datetime from loguru import logger # from . import Enricher -from databases import Database -from metadata import Metadata -from media import Media -from steps.gsheet import Gsheets -from utils import GWorksheet +from . import Database +from ..core import Metadata +from ..core import Media +from ..utils import Gsheets +from ..utils import GWorksheet class GsheetsDb(Database): @@ -91,7 +91,7 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now gw: GWorksheet = item.get("gsheet").get("worksheet") row: int = item.get("gsheet").get("row") - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now return gw, row diff --git a/src/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py similarity index 100% rename from src/enrichers/__init__.py rename to src/auto_archiver/enrichers/__init__.py diff --git a/src/enrichers/enricher.py b/src/auto_archiver/enrichers/enricher.py similarity index 89% rename from src/enrichers/enricher.py rename to src/auto_archiver/enrichers/enricher.py index 9d11276..f67d9fe 100644 --- a/src/enrichers/enricher.py +++ b/src/auto_archiver/enrichers/enricher.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC -from metadata import Metadata -from steps.step import Step +from ..core import Metadata +from ..core import Step @dataclass class Enricher(Step, ABC): diff --git a/src/enrichers/hash_enricher.py b/src/auto_archiver/enrichers/hash_enricher.py similarity index 96% rename from src/enrichers/hash_enricher.py rename to src/auto_archiver/enrichers/hash_enricher.py index 786c861..7970b17 100644 --- a/src/enrichers/hash_enricher.py +++ b/src/auto_archiver/enrichers/hash_enricher.py @@ -1,11 +1,12 @@ import hashlib -from utils import Webdriver -from . import Enricher -from metadata import Metadata from loguru import logger from selenium.common.exceptions import TimeoutException import time, requests +from . import Enricher +from ..utils import Webdriver +from ..core import Metadata + class HashEnricher(Enricher): """ diff --git a/src/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py similarity index 94% rename from src/enrichers/screenshot_enricher.py rename to src/auto_archiver/enrichers/screenshot_enricher.py index 0375e3b..0ae2e29 100644 --- a/src/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -1,11 +1,11 @@ -from media import Media -from utils import Webdriver -from . import Enricher -from metadata import Metadata from loguru import logger import time, uuid, os from selenium.common.exceptions import TimeoutException +from . import Enricher +from ..utils import Webdriver +from ..core import Media +from ..core import Metadata class ScreenshotEnricher(Enricher): name = "screenshot_enricher" diff --git a/src/enrichers/thumbnail_enricher.py b/src/auto_archiver/enrichers/thumbnail_enricher.py similarity index 96% rename from src/enrichers/thumbnail_enricher.py rename to src/auto_archiver/enrichers/thumbnail_enricher.py index 32e09be..f1a3149 100644 --- a/src/enrichers/thumbnail_enricher.py +++ b/src/auto_archiver/enrichers/thumbnail_enricher.py @@ -1,10 +1,10 @@ import uuid -from media import Media -from . import Enricher -from metadata import Metadata from loguru import logger import ffmpeg, os +from . import Enricher +from ..core import Media +from ..core import Metadata class ThumbnailEnricher(Enricher): """ diff --git a/src/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py similarity index 96% rename from src/enrichers/wacz_enricher.py rename to src/auto_archiver/enrichers/wacz_enricher.py index 1fa3191..504b6ec 100644 --- a/src/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -2,13 +2,13 @@ import os import shutil import subprocess import uuid -from archivers.archiver import Archiverv2 -from media import Media -from . import Enricher -from metadata import Metadata from loguru import logger import time, requests +from ..core import Media +from ..core import Metadata +from . import Enricher + class WaczEnricher(Enricher): """ diff --git a/src/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py similarity index 98% rename from src/enrichers/wayback_enricher.py rename to src/auto_archiver/enrichers/wayback_enricher.py index db53a08..945b148 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/auto_archiver/enrichers/wayback_enricher.py @@ -1,9 +1,9 @@ -from archivers.archiver import Archiverv2 -from . import Enricher -from metadata import Metadata from loguru import logger import time, requests +from . import Enricher +from ..archivers import Archiverv2 +from ..core import Metadata class WaybackArchiverEnricher(Enricher, Archiverv2): """ diff --git a/src/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py similarity index 100% rename from src/feeders/__init__.py rename to src/auto_archiver/feeders/__init__.py diff --git a/src/feeders/feeder.py b/src/auto_archiver/feeders/feeder.py similarity index 80% rename from src/feeders/feeder.py rename to src/auto_archiver/feeders/feeder.py index bccfab8..4aa263f 100644 --- a/src/feeders/feeder.py +++ b/src/auto_archiver/feeders/feeder.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from metadata import Metadata -from steps.step import Step +from ..core import Metadata +from ..core import Step @dataclass diff --git a/src/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py similarity index 96% rename from src/feeders/gsheet_feeder.py rename to src/auto_archiver/feeders/gsheet_feeder.py index 029813f..19b1fbc 100644 --- a/src/feeders/gsheet_feeder.py +++ b/src/auto_archiver/feeders/gsheet_feeder.py @@ -2,13 +2,13 @@ import gspread, os # from metadata import Metadata from loguru import logger +from slugify import slugify # from . import Enricher -from feeders import Feeder -from metadata import Metadata -from steps.gsheet import Gsheets -from utils import GWorksheet -from slugify import slugify +from . import Feeder +from ..core import Metadata +from ..utils import Gsheets +from ..utils import GWorksheet class GsheetsFeeder(Gsheets, Feeder): name = "gsheet_feeder" diff --git a/src/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py similarity index 100% rename from src/formatters/__init__.py rename to src/auto_archiver/formatters/__init__.py diff --git a/src/formatters/formatter.py b/src/auto_archiver/formatters/formatter.py similarity index 80% rename from src/formatters/formatter.py rename to src/auto_archiver/formatters/formatter.py index 7199be2..80d5d06 100644 --- a/src/formatters/formatter.py +++ b/src/auto_archiver/formatters/formatter.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from metadata import Metadata -from steps.step import Step +from ..core import Metadata +from ..core import Step @dataclass diff --git a/src/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py similarity index 96% rename from src/formatters/html_formatter.py rename to src/auto_archiver/formatters/html_formatter.py index a78ff2b..9f5017f 100644 --- a/src/formatters/html_formatter.py +++ b/src/auto_archiver/formatters/html_formatter.py @@ -2,12 +2,13 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod import mimetypes -from metadata import Metadata -from media import Media -from formatters import Formatter from jinja2 import Environment, FileSystemLoader import uuid, os, pathlib +from ..core import Metadata +from ..core import Media +from . import Formatter + @dataclass class HtmlFormatter(Formatter): @@ -72,5 +73,6 @@ def is_audio_jinja(s: str) -> bool: m = mimetypes.guess_type(s)[0] return "audio" in (m or "") + def is_media_jinja(v) -> bool: return isinstance(v, Media) diff --git a/src/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html similarity index 100% rename from src/formatters/templates/html_template.html rename to src/auto_archiver/formatters/templates/html_template.html diff --git a/src/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html similarity index 100% rename from src/formatters/templates/macros.html rename to src/auto_archiver/formatters/templates/macros.html diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py new file mode 100644 index 0000000..7094d25 --- /dev/null +++ b/src/auto_archiver/storages/__init__.py @@ -0,0 +1,9 @@ +# we need to explicitly expose the available imports here +from .base_storage import Storage +# from .local_storage import LocalStorage, LocalConfig +# from .s3_storage import S3Config, S3Storage +# from .gd_storage import GDConfig, GDStorage + +from .storage import StorageV2 +from .s3 import S3StorageV2 +from .local import LocalStorageV2 \ No newline at end of file diff --git a/src/storages/base_storage.py b/src/auto_archiver/storages/base_storage.py similarity index 100% rename from src/storages/base_storage.py rename to src/auto_archiver/storages/base_storage.py diff --git a/src/storages/gd_storage.py b/src/auto_archiver/storages/gd_storage.py similarity index 100% rename from src/storages/gd_storage.py rename to src/auto_archiver/storages/gd_storage.py diff --git a/src/storages/local.py b/src/auto_archiver/storages/local.py similarity index 94% rename from src/storages/local.py rename to src/auto_archiver/storages/local.py index aafb28c..1d44e6f 100644 --- a/src/storages/local.py +++ b/src/auto_archiver/storages/local.py @@ -3,12 +3,13 @@ import shutil from typing import IO, Any import boto3, uuid, os, mimetypes from botocore.errorfactory import ClientError -from metadata import Metadata -from media import Media -from storages import StorageV2 from loguru import logger from slugify import slugify +from ..core import Metadata +from ..core import Media +from ..storages import StorageV2 + class LocalStorageV2(StorageV2): name = "local_storage" diff --git a/src/storages/s3.py b/src/auto_archiver/storages/s3.py similarity index 97% rename from src/storages/s3.py rename to src/auto_archiver/storages/s3.py index 1ae7e38..25eb324 100644 --- a/src/storages/s3.py +++ b/src/auto_archiver/storages/s3.py @@ -2,9 +2,9 @@ from typing import IO, Any import boto3, uuid, os, mimetypes from botocore.errorfactory import ClientError -from metadata import Metadata -from media import Media -from storages import StorageV2 +from ..core import Metadata +from ..core import Media +from ..storages import StorageV2 from loguru import logger from slugify import slugify diff --git a/src/storages/s3_storage.py b/src/auto_archiver/storages/s3_storage.py similarity index 100% rename from src/storages/s3_storage.py rename to src/auto_archiver/storages/s3_storage.py diff --git a/src/storages/storage.py b/src/auto_archiver/storages/storage.py similarity index 95% rename from src/storages/storage.py rename to src/auto_archiver/storages/storage.py index 61d4c77..9d6a005 100644 --- a/src/storages/storage.py +++ b/src/auto_archiver/storages/storage.py @@ -2,9 +2,8 @@ from __future__ import annotations from abc import abstractmethod from dataclasses import dataclass from typing import IO, Any -from media import Media -from metadata import Metadata -from steps.step import Step + +from ..core import Media, Metadata, Step from loguru import logger import os, uuid from slugify import slugify diff --git a/src/utils/__init__.py b/src/auto_archiver/utils/__init__.py similarity index 69% rename from src/utils/__init__.py rename to src/auto_archiver/utils/__init__.py index 9aff525..a20e191 100644 --- a/src/utils/__init__.py +++ b/src/auto_archiver/utils/__init__.py @@ -2,4 +2,5 @@ from .gworksheet import GWorksheet from .misc import * from .util import Util -from .webdriver import Webdriver \ No newline at end of file +from .webdriver import Webdriver +from .gsheet import Gsheets \ No newline at end of file diff --git a/src/steps/gsheet.py b/src/auto_archiver/utils/gsheet.py similarity index 98% rename from src/steps/gsheet.py rename to src/auto_archiver/utils/gsheet.py index 262add1..98259f8 100644 --- a/src/steps/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -1,7 +1,7 @@ import json, gspread from loguru import logger -from steps.step import Step +from ..core import Step class Gsheets(Step): diff --git a/src/utils/gworksheet.py b/src/auto_archiver/utils/gworksheet.py similarity index 100% rename from src/utils/gworksheet.py rename to src/auto_archiver/utils/gworksheet.py diff --git a/src/utils/misc.py b/src/auto_archiver/utils/misc.py similarity index 100% rename from src/utils/misc.py rename to src/auto_archiver/utils/misc.py diff --git a/src/utils/util.py b/src/auto_archiver/utils/util.py similarity index 88% rename from src/utils/util.py rename to src/auto_archiver/utils/util.py index e465bda..898b260 100644 --- a/src/utils/util.py +++ b/src/auto_archiver/utils/util.py @@ -1,8 +1,7 @@ from __future__ import annotations from abc import abstractmethod from dataclasses import dataclass -from metadata import Metadata -from steps.step import Step +from ..core import Metadata, Step #TODO: likely unused @dataclass diff --git a/src/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py similarity index 100% rename from src/utils/webdriver.py rename to src/auto_archiver/utils/webdriver.py diff --git a/src/configs/__init__.py b/src/configs/__init__.py deleted file mode 100644 index 1f01b62..0000000 --- a/src/configs/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .config import Config -from .selenium_config import SeleniumConfig -from .telethon_config import TelethonConfig -from .wayback_config import WaybackConfig -from .twitter_api_config import TwitterApiConfig -from .vk_config import VkConfig -from .instagram_config import InstagramConfig \ No newline at end of file diff --git a/src/configs/browsertrix_config.py b/src/configs/browsertrix_config.py deleted file mode 100644 index bc7acd5..0000000 --- a/src/configs/browsertrix_config.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class BrowsertrixConfig: - enabled: bool - profile: str - timeout_seconds: str diff --git a/src/configs/config.py b/src/configs/config.py deleted file mode 100644 index bbd385e..0000000 --- a/src/configs/config.py +++ /dev/null @@ -1,309 +0,0 @@ -import argparse, yaml, json, os -import gspread -from loguru import logger -from selenium import webdriver -from dataclasses import asdict -from selenium.common.exceptions import TimeoutException - -from utils import GWorksheet, getattr_or -from .wayback_config import WaybackConfig -from .telethon_config import TelethonConfig -from .selenium_config import SeleniumConfig -from .vk_config import VkConfig -from .twitter_api_config import TwitterApiConfig -from .browsertrix_config import BrowsertrixConfig -from .instagram_config import InstagramConfig -from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig - - -class Config: - """ - Controls the current execution parameters and manages API configurations - Usage: - c = Config() # initializes the argument parser - c.parse() # parses the values and initializes the Services and API clients - # you can then access the Services and APIs like 'c.s3_config' - All the configurations available as cmd line options, when included, will - override the configurations in the config.yaml file. - Configurations are split between: - 1. "secrets" containing API keys for generating services - not kept in memory - 2. "execution" containing specific execution configurations - """ - AVAILABLE_STORAGES = {"s3", "gd", "local"} - - def __init__(self): - self.parser = self.get_argument_parser() - self.folder = "" - self.is_docker = bool(os.environ.get("IS_DOCKER", 0)) - - def parse(self): - self.args = self.parser.parse_args() - logger.success(f'Command line arguments parsed successfully') - self.config_file = self.args.config - self.read_config_yaml() - logger.info(f'APIs and Services initialized:\n{self}') - - def read_config_yaml(self): - with open(self.config_file, "r", encoding="utf-8") as inf: - self.config = yaml.safe_load(inf) - - self.url = getattr_or(self.args, "url", '') - - # ----------------------EXECUTION - execution configurations - execution = self.config.get("execution", {}) - - self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) - assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" - - def ensure_set(l): - # always returns a set of strings, can receive a set or a string - l = l if isinstance(l, list) else [l] - return set([x for x in l if isinstance(x, str) and len(x) > 0]) - self.worksheet_allow = ensure_set(execution.get("worksheet_allow", [])) - self.worksheet_block = ensure_set(execution.get("worksheet_block", [])) - - self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) - self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) - self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False) - if self.save_logs: - self.set_log_files() - self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False) - - # Column names come from config and can be overwritten by CMD - # in the end all are considered as lower case - config_column_names = execution.get("column_names", {}) - self.column_names = {} - for k in GWorksheet.COLUMN_NAMES.keys(): - self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower() - - # selenium driver - selenium_configs = execution.get("selenium", {}) - self.selenium_config = SeleniumConfig( - timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)), - window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)), - window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height)) - ) - self.webdriver = "not initialized" - - # browsertrix config - browsertrix_configs = execution.get("browsertrix", {}) - if len(browsertrix_profile := browsertrix_configs.get("profile", "")): - browsertrix_profile = os.path.abspath(browsertrix_profile) - self.browsertrix_config = BrowsertrixConfig( - enabled=bool(browsertrix_configs.get("enabled", False)), - profile=browsertrix_profile, - timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") - ) - - self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") - - # ---------------------- SECRETS - APIs and service configurations - secrets = self.config.get("secrets", {}) - - # assert selected storage credentials exist - for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]: - assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}" - - # google sheets config - self.gsheets_client = gspread.service_account( - filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json') - ) - - # facebook config - self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None) - - # s3 config - if "s3" in secrets: - s3 = secrets["s3"] - self.s3_config = S3Config( - bucket=s3["bucket"], - region=s3["region"], - key=s3["key"], - secret=s3["secret"], - endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url), - cdn_url=s3.get("cdn_url", S3Config.cdn_url), - key_path=s3.get("key_path", S3Config.key_path), - private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private)) - ) - - # GDrive config - if "google_drive" in secrets: - gd = secrets["google_drive"] - self.gd_config = GDConfig( - root_folder_id=gd.get("root_folder_id"), - oauth_token_filename=gd.get("oauth_token_filename"), - service_account=gd.get("service_account", GDConfig.service_account) - ) - - if "local" in secrets: - self.local_config = LocalConfig( - save_to=secrets["local"].get("save_to", LocalConfig.save_to), - ) - - # wayback machine config - if "wayback" in secrets: - self.wayback_config = WaybackConfig( - key=secrets["wayback"]["key"], - secret=secrets["wayback"]["secret"], - ) - else: - self.wayback_config = None - logger.debug(f"'wayback' key not present in the {self.config_file=}") - - # telethon config - if "telegram" in secrets: - self.telegram_config = TelethonConfig( - api_id=secrets["telegram"]["api_id"], - api_hash=secrets["telegram"]["api_hash"], - bot_token=secrets["telegram"].get("bot_token", None), - session_file=secrets["telegram"].get("session_file", "./anon") - ) - else: - self.telegram_config = None - logger.debug(f"'telegram' key not present in the {self.config_file=}") - - # twitter config - if "twitter" in secrets: - self.twitter_config = TwitterApiConfig( - bearer_token=secrets["twitter"].get("bearer_token"), - consumer_key=secrets["twitter"].get("consumer_key"), - consumer_secret=secrets["twitter"].get("consumer_secret"), - access_token=secrets["twitter"].get("access_token"), - access_secret=secrets["twitter"].get("access_secret"), - ) - else: - self.twitter_config = None - logger.debug(f"'twitter' key not present in the {self.config_file=}") - - # vk config - if "vk" in secrets: - self.vk_config = VkConfig( - username=secrets["vk"]["username"], - password=secrets["vk"]["password"], - session_file=secrets["vk"].get("session_file", "./vk_config.v2.json") - ) - else: - self.vk_config = None - logger.debug(f"'vk' key not present in the {self.config_file=}") - - # instagram config - if "instagram" in secrets: - self.instagram_config = InstagramConfig( - username=secrets["instagram"]["username"], - password=secrets["instagram"]["password"], - session_file=secrets["instagram"].get("session_file", "instaloader.session") - ) - else: - self.instagram_config = None - logger.debug(f"'instagram' key not present in the {self.config_file=}") - - del self.config["secrets"] # delete to prevent leaks - - def set_log_files(self): - # called only when config.execution.save_logs=true - logger.add("logs/1trace.log", level="TRACE") - logger.add("logs/2info.log", level="INFO") - logger.add("logs/3success.log", level="SUCCESS") - logger.add("logs/4warning.log", level="WARNING") - logger.add("logs/5error.log", level="ERROR") - - def get_argument_parser(self): - """ - Creates the CMD line arguments. 'python auto_archive.py --help' - """ - parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ') - - parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction') - parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') - parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES) - parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]') - parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]') - parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists]') - parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs]') - parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]') - - for k, v in GWorksheet.COLUMN_NAMES.items(): - help = f"the name of the column to FILL WITH {k} (default='{v}')" - if k in ["url", "folder"]: - help = f"the name of the column to READ {k} FROM (default='{v}')" - parser.add_argument(f'--col-{k}', action='store', dest=k, help=help) - - return parser - - def set_folder(self, folder): - """ - update the folder in each of the storages - """ - self.folder = folder - logger.info(f"setting folder to {folder}") - # s3 - if hasattr(self, "s3_config"): self.s3_config.folder = folder - if hasattr(self, "s3_storage"): self.s3_storage.folder = folder - # gdrive - if hasattr(self, "gd_config"): self.gd_config.folder = folder - if hasattr(self, "gd_storage"): self.gd_storage.folder = folder - # local - if hasattr(self, "local_config"): self.local_config.folder = folder - if hasattr(self, "local_storage"): self.local_storage.folder = folder - - def get_storage(self): - """ - returns the configured type of storage, creating if needed - """ - if self.storage == "s3": - self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config)) - return self.s3_storage - elif self.storage == "gd": - self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config)) - return self.gd_storage - elif self.storage == "local": - self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config)) - return self.local_storage - raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}" - - def destroy_webdriver(self): - if self.webdriver is not None and type(self.webdriver) != str: - self.webdriver.close() - self.webdriver.quit() - del self.webdriver - - def recreate_webdriver(self): - options = webdriver.FirefoxOptions() - options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) - try: - new_webdriver = webdriver.Firefox(options=options) - # only destroy if creation is successful - self.destroy_webdriver() - self.webdriver = new_webdriver - self.webdriver.set_window_size(self.selenium_config.window_width, - self.selenium_config.window_height) - self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) - except TimeoutException as e: - logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") - - def __str__(self) -> str: - return json.dumps({ - "config_file": self.config_file, - "sheet": self.sheet, - "worksheet_allow": list(self.worksheet_allow), - "worksheet_block": list(self.worksheet_block), - "storage": self.storage, - "header": self.header, - "check_if_exists": self.check_if_exists, - "hash_algorithm": self.hash_algorithm, - "browsertrix_config": asdict(self.browsertrix_config), - "save_logs": self.save_logs, - "selenium_config": asdict(self.selenium_config), - "selenium_webdriver": self.webdriver != None, - "s3_config": hasattr(self, "s3_config"), - "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), - "gd_config": hasattr(self, "gd_config"), - "local_config": hasattr(self, "local_config"), - "wayback_config": self.wayback_config != None, - "telegram_config": self.telegram_config != None, - "twitter_config": self.twitter_config != None, - "vk_config": self.vk_config != None, - "gsheets_client": self.gsheets_client != None, - "column_names": self.column_names, - }, ensure_ascii=False, indent=4) diff --git a/src/configs/instagram_config.py b/src/configs/instagram_config.py deleted file mode 100644 index a9f26b4..0000000 --- a/src/configs/instagram_config.py +++ /dev/null @@ -1,9 +0,0 @@ - -from dataclasses import dataclass - - -@dataclass -class InstagramConfig: - username: str - password: str - session_file: str diff --git a/src/configs/selenium_config.py b/src/configs/selenium_config.py deleted file mode 100644 index 8e060af..0000000 --- a/src/configs/selenium_config.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class SeleniumConfig: - timeout_seconds: int = 120 - window_width: int = 1400 - window_height: int = 2000 diff --git a/src/configs/telethon_config.py b/src/configs/telethon_config.py deleted file mode 100644 index 111c7bd..0000000 --- a/src/configs/telethon_config.py +++ /dev/null @@ -1,10 +0,0 @@ - -from dataclasses import dataclass - - -@dataclass -class TelethonConfig: - api_id: str - api_hash: str - bot_token: str - session_file: str diff --git a/src/configs/twitter_api_config.py b/src/configs/twitter_api_config.py deleted file mode 100644 index 4193111..0000000 --- a/src/configs/twitter_api_config.py +++ /dev/null @@ -1,11 +0,0 @@ - -from dataclasses import dataclass - - -@dataclass -class TwitterApiConfig: - bearer_token: str - consumer_key: str - consumer_secret: str - access_token: str - access_secret: str diff --git a/src/configs/vk_config.py b/src/configs/vk_config.py deleted file mode 100644 index 4c3472c..0000000 --- a/src/configs/vk_config.py +++ /dev/null @@ -1,9 +0,0 @@ - -from dataclasses import dataclass - - -@dataclass -class VkConfig: - username: str - password: str - session_file: str diff --git a/src/configs/wayback_config.py b/src/configs/wayback_config.py deleted file mode 100644 index 7770f66..0000000 --- a/src/configs/wayback_config.py +++ /dev/null @@ -1,8 +0,0 @@ - -from dataclasses import dataclass - - -@dataclass -class WaybackConfig: - key: str - secret: str diff --git a/src/databases/__init__.py b/src/databases/__init__.py deleted file mode 100644 index 17b9c6d..0000000 --- a/src/databases/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .database import Database -from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/storages/__init__.py b/src/storages/__init__.py deleted file mode 100644 index 4c0783c..0000000 --- a/src/storages/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# we need to explicitly expose the available imports here -from .base_storage import Storage -from .local_storage import LocalStorage, LocalConfig -from .s3_storage import S3Config, S3Storage -from .gd_storage import GDConfig, GDStorage - -from .storage import StorageV2 -from .s3 import S3StorageV2 -from .local import LocalStorageV2 \ No newline at end of file diff --git a/src/storages/local_storage.py b/src/storages/local_storage.py deleted file mode 100644 index 1109767..0000000 --- a/src/storages/local_storage.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from dataclasses import dataclass -from loguru import logger - -from .base_storage import Storage -from utils import mkdir_if_not_exists - - -@dataclass -class LocalConfig: - folder: str = "" - save_to: str = "./" - -class LocalStorage(Storage): - def __init__(self, config:LocalConfig): - self.folder = config.folder - self.save_to = config.save_to - mkdir_if_not_exists(self.save_to) - - def get_cdn_url(self, key): - key = self.clean_key(key) - logger.info(f"{key=}") - full_path = os.path.join(self.save_to, self.folder, key) - logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}") - os.makedirs(os.path.dirname(full_path), exist_ok=True) - # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1])) - return os.path.abspath(full_path) - - def exists(self, key): - return os.path.isfile(self.get_cdn_url(key)) - - def uploadf(self, file, key, **kwargs): - path = self.get_cdn_url(key) - with open(path, "wb") as outf: - outf.write(file.read()) diff --git a/src/v2.py b/src/v2.py deleted file mode 100644 index 8ecb820..0000000 --- a/src/v2.py +++ /dev/null @@ -1,12 +0,0 @@ - - -from abc import ABC -from configs.v2config import ConfigV2 -from orchestrator import ArchivingOrchestrator - -config = ConfigV2() -config.parse() - -orchestrator = ArchivingOrchestrator(config) - -orchestrator.feed()