Merge pull request #33 from bellingcat/refactor-configs

breaking changes: refactor configs + fixes
pull/42/head
Miguel Sozinho Ramalho 2022-06-16 16:19:28 +01:00 committed by GitHub
commit 4c6f3ea688
No key found in the database for this signature
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 1431 additions and 703 deletions

View file

@ -1,18 +0,0 @@
DO_SPACES_REGION=
DO_SPACES_KEY=
DO_SPACES_SECRET=
DO_BUCKET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=cookie: datr= xxxx
# Google Drive, Right click on folder, Get link:
# https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing
# we want: 123456789987654321abcdefghijk
# Remember to share the folder with the service email
# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
GD_ROOT_FOLDER_ID=

8
.gitignore vendored
View file

@ -10,4 +10,10 @@ anu.html
*.log
.pytest_cach
anon*
config*.json
config.json
config-*.json
config.yaml
config-*.yaml
logs/*
local_archive/
vk_config*.json

View file

@ -6,10 +6,9 @@ name = "pypi"
[packages]
gspread = "*"
boto3 = "*"
python-dotenv = "*"
argparse = "*"
beautifulsoup4 = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
tiktok-downloader = "*"
bs4 = "*"
loguru = "*"
ffmpeg-python = "*"
@ -21,6 +20,10 @@ google-api-python-client = "*"
google-auth-httplib2 = "*"
google-auth-oauthlib = "*"
oauth2client = "*"
python-slugify = "*"
pyyaml = "*"
vk-api = "*"
dateparser = "*"
[requires]
python_version = "3.9"

472
Pipfile.lock generated
View file

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "25b858227d74cc232bba525d34dcf30f15d18d535a6e9c59555e85a0a2bd8c61"
"sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd"
},
"pipfile-spec": 6,
"requires": {
@ -50,19 +50,19 @@
},
"boto3": {
"hashes": [
"sha256:3fb956d097105a0fb98c29a622ff233fa8de68519aabd7088d7ffd36dfc33214",
"sha256:b59a210fa6a87f0c755b40403ffc66b9b285680bbc5ad5245cf167e2def33620"
"sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4",
"sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8"
],
"index": "pypi",
"version": "==1.23.7"
"version": "==1.24.9"
},
"botocore": {
"hashes": [
"sha256:0f4a467188644382856e96e85bff0b453442d5cf0c0f554154571a6e2468a005",
"sha256:9f8d5e8d65b24d97fcb7804b84831e5627fceb52707167d2f496477675c98ded"
"sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad",
"sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6"
],
"markers": "python_version >= '3.6'",
"version": "==1.26.7"
"markers": "python_version >= '3.7'",
"version": "==1.27.9"
},
"brotli": {
"hashes": [
@ -141,18 +141,18 @@
},
"cachetools": {
"hashes": [
"sha256:4ebbd38701cdfd3603d1f751d851ed248ab4570929f2d8a7ce69e30c420b141c",
"sha256:8b3b8fa53f564762e5b221e9896798951e7f915513abf2ba072ce0f07f3f5a98"
"sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757",
"sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"
],
"markers": "python_version ~= '3.7'",
"version": "==5.1.0"
"version": "==5.2.0"
},
"certifi": {
"hashes": [
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.5.18.1"
},
"cffi": {
@ -215,7 +215,7 @@
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"markers": "python_version >= '3.5'",
"version": "==2.0.12"
},
"click": {
@ -233,6 +233,13 @@
],
"version": "==1.2.60"
},
"commonmark": {
"hashes": [
"sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
"sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
],
"version": "==0.9.1"
},
"cryptography": {
"hashes": [
"sha256:093cb351031656d3ee2f4fa1be579a8c69c754cf874206be1d4cf3b542042804",
@ -260,13 +267,13 @@
],
"version": "==37.0.2"
},
"faker": {
"dateparser": {
"hashes": [
"sha256:c6ff91847d7c820afc0a74d95e824b48aab71ddfd9003f300641e42d58ae886f",
"sha256:cad1f69d72a68878cd67855140b6fe3e44c11628971cd838595d289c98bc45de"
"sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9",
"sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628"
],
"markers": "python_version >= '3.6'",
"version": "==13.11.1"
"index": "pypi",
"version": "==1.1.1"
},
"ffmpeg-python": {
"hashes": [
@ -278,11 +285,11 @@
},
"filelock": {
"hashes": [
"sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20",
"sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"
"sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",
"sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
"version": "==3.7.1"
},
"flask": {
"hashes": [
@ -301,27 +308,27 @@
},
"google-api-core": {
"hashes": [
"sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3",
"sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d"
"sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0",
"sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359"
],
"markers": "python_version >= '3.6'",
"version": "==2.8.0"
"markers": "python_full_version >= '3.6.0'",
"version": "==2.8.1"
},
"google-api-python-client": {
"hashes": [
"sha256:4527f7b8518a795624ab68da412d55628f83b98c67dd6a5d6edf725454f8b30b",
"sha256:600c43d7eac6e3536fdcad1d14ba9ee503edf4c7db0bd827e791bbf03b9f1330"
"sha256:a573373041b3f6ccbd04877b70e7425c52daec5b4fe5f440e8f5895c87d1a69c",
"sha256:b444f839bed289ecfe30950ea1cd15b7e7976d8cf9f0a3c778037ae3fb030df3"
],
"index": "pypi",
"version": "==2.48.0"
"version": "==2.51.0"
},
"google-auth": {
"hashes": [
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
"sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d"
"sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89",
"sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.6"
"version": "==2.8.0"
},
"google-auth-httplib2": {
"hashes": [
@ -333,34 +340,34 @@
},
"google-auth-oauthlib": {
"hashes": [
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
"sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f",
"sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae"
],
"index": "pypi",
"version": "==0.5.1"
"version": "==0.5.2"
},
"googleapis-common-protos": {
"hashes": [
"sha256:6b5ee59dc646eb61a8eb65ee1db186d3df6687c8804830024f32573298bca19b",
"sha256:ddcd955b5bb6589368f659fa475373faa1ed7d09cde5ba25e88513d87007e174"
"sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c",
"sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3"
],
"markers": "python_version >= '3.6'",
"version": "==1.56.1"
"markers": "python_full_version >= '3.6.0'",
"version": "==1.56.2"
},
"gspread": {
"hashes": [
"sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283",
"sha256:c719e1c024a2a6f3b7d818fbe07c3886b26fd6504b64d1b1359cf242968213cd"
"sha256:21704b47d007c3b5fd34eddfa4c4a9dcd1ecc1dc615083b9c636127726e66c18",
"sha256:b6172b62fa899e3e4199d2d0ea1008b64305554ba08d3d3a96e9123824fdec48"
],
"index": "pypi",
"version": "==5.3.2"
"version": "==5.4.0"
},
"h11": {
"hashes": [
"sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06",
"sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==0.13.0"
},
"httplib2": {
@ -376,7 +383,7 @@
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"markers": "python_version >= '3.5'",
"version": "==3.3"
},
"importlib-metadata": {
@ -421,70 +428,72 @@
},
"lxml": {
"hashes": [
"sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
"sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
"sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
"sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
"sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
"sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
"sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
"sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
"sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
"sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
"sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
"sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
"sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
"sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
"sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
"sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
"sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
"sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
"sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
"sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
"sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
"sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
"sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
"sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
"sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
"sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
"sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
"sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
"sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
"sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
"sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
"sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
"sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
"sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
"sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
"sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
"sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
"sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
"sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
"sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
"sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
"sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
"sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
"sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
"sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
"sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
"sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
"sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
"sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
"sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
"sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
"sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
"sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
"sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
"sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
"sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
"sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
"sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
"sha256:00f3a6f88fd5f4357844dd91a1abac5f466c6799f1b7f1da2df6665253845b11",
"sha256:024684e0c5cfa121c22140d3a0898a3a9b2ea0f0fd2c229b6658af4bdf1155e5",
"sha256:03370ec37fe562238d385e2c53089076dee53aabf8325cab964fdb04a9130fa0",
"sha256:0aa4cce579512c33373ca4c5e23c21e40c1aa1a33533a75e51b654834fd0e4f2",
"sha256:1057356b808d149bc14eb8f37bb89129f237df488661c1e0fc0376ca90e1d2c3",
"sha256:11d62c97ceff9bab94b6b29c010ea5fb6831743459bb759c917f49ba75601cd0",
"sha256:1254a79f8a67a3908de725caf59eae62d86738f6387b0a34b32e02abd6ae73db",
"sha256:1bfb791a8fcdbf55d1d41b8be940393687bec0e9b12733f0796668086d1a23ff",
"sha256:28cf04a1a38e961d4a764d2940af9b941b66263ed5584392ef875ee9c1e360a3",
"sha256:2b9c2341d96926b0d0e132e5c49ef85eb53fa92ae1c3a70f9072f3db0d32bc07",
"sha256:2d10659e6e5c53298e6d718fd126e793285bff904bb71d7239a17218f6a197b7",
"sha256:3af00ee88376022589ceeb8170eb67dacf5f7cd625ea59fa0977d719777d4ae8",
"sha256:3cf816aed8125cfc9e6e5c6c31ff94278320d591bd7970c4a0233bee0d1c8790",
"sha256:4becd16750ca5c2a1b1588269322b2cebd10c07738f336c922b658dbab96a61c",
"sha256:4cd69bca464e892ea4ed544ba6a7850aaff6f8d792f8055a10638db60acbac18",
"sha256:4e97c8fc761ad63909198acc892f34c20f37f3baa2c50a62d5ec5d7f1efc68a1",
"sha256:520461c36727268a989790aef08884347cd41f2d8ae855489ccf40b50321d8d7",
"sha256:53b0410b220766321759f7f9066da67b1d0d4a7f6636a477984cbb1d98483955",
"sha256:56e19fb6e4b8bd07fb20028d03d3bc67bcc0621347fbde64f248e44839771756",
"sha256:5a49ad78543925e1a4196e20c9c54492afa4f1502c2a563f73097e2044c75190",
"sha256:5d52e1173f52020392f593f87a6af2d4055dd800574a5cb0af4ea3878801d307",
"sha256:607224ffae9a0cf0a2f6e14f5f6bce43e83a6fbdaa647891729c103bdd6a5593",
"sha256:612ef8f2795a89ba3a1d4c8c1af84d8453fd53ee611aa5ad460fdd2cab426fc2",
"sha256:615886ee84b6f42f1bdf1852a9669b5fe3b96b6ff27f1a7a330b67ad9911200a",
"sha256:63419db39df8dc5564f6f103102c4665f7e4d9cb64030e98cf7a74eae5d5760d",
"sha256:6467626fa74f96f4d80fc6ec2555799e97fff8f36e0bfc7f67769f83e59cff40",
"sha256:65b3b5f12c6fb5611e79157214f3cd533083f9b058bf2fc8a1c5cc5ee40fdc5a",
"sha256:686565ac77ff94a8965c11829af253d9e2ce3bf0d9225b1d2eb5c4d4666d0dca",
"sha256:6af7f51a6010748fc1bb71917318d953c9673e4ae3f6d285aaf93ef5b2eb11c1",
"sha256:70a198030d26f5e569367f0f04509b63256faa76a22886280eea69a4f535dd40",
"sha256:754a1dd04bff8a509a31146bd8f3a5dc8191a8694d582dd5fb71ff09f0722c22",
"sha256:75da29a0752c8f2395df0115ac1681cefbdd4418676015be8178b733704cbff2",
"sha256:81c29c8741fa07ecec8ec7417c3d8d1e2f18cf5a10a280f4e1c3f8c3590228b2",
"sha256:9093a359a86650a3dbd6532c3e4d21a6f58ba2cb60d0e72db0848115d24c10ba",
"sha256:915ecf7d486df17cc65aeefdb680d5ad4390cc8c857cf8db3fe241ed234f856a",
"sha256:94b181dd2777890139e49a5336bf3a9a3378ce66132c665fe8db4e8b7683cde2",
"sha256:94f2e45b054dd759bed137b6e14ae8625495f7d90ddd23cf62c7a68f72b62656",
"sha256:9af19eb789d674b59a9bee5005779757aab857c40bf9cc313cb01eafac55ce55",
"sha256:9cae837b988f44925d14d048fa6a8c54f197c8b1223fd9ee9c27084f84606143",
"sha256:aa7447bf7c1a15ef24e2b86a277b585dd3f055e8890ac7f97374d170187daa97",
"sha256:b1e22f3ee4d75ca261b6bffbf64f6f178cb194b1be3191065a09f8d98828daa9",
"sha256:b5031d151d6147eac53366d6ec87da84cd4d8c5e80b1d9948a667a7164116e39",
"sha256:b62d1431b4c40cda43cc986f19b8c86b1d2ae8918cfc00f4776fdf070b65c0c4",
"sha256:b71c52d69b91af7d18c13aef1b0cc3baee36b78607c711eb14a52bf3aa7c815e",
"sha256:b7679344f2270840dc5babc9ccbedbc04f7473c1f66d4676bb01680c0db85bcc",
"sha256:bb7c1b029e54e26e01b1d1d912fc21abb65650d16ea9a191d026def4ed0859ed",
"sha256:c2a57755e366e0ac7ebdb3e9207f159c3bf1afed02392ab18453ce81f5ee92ee",
"sha256:cf9ec915857d260511399ab87e1e70fa13d6b2972258f8e620a3959468edfc32",
"sha256:d0d03b9636f1326772e6854459728676354d4c7731dae9902b180e2065ba3da6",
"sha256:d1690c4d37674a5f0cdafbc5ed7e360800afcf06928c2a024c779c046891bf09",
"sha256:d76da27f5e3e9bc40eba6ed7a9e985f57547e98cf20521d91215707f2fb57e0f",
"sha256:d882c2f3345261e898b9f604be76b61c901fbfa4ac32e3f51d5dc1edc89da3cb",
"sha256:d8e5021e770b0a3084c30dda5901d5fce6d4474feaf0ced8f8e5a82702502fbb",
"sha256:dd00d28d1ab5fa7627f5abc957f29a6338a7395b724571a8cbff8fbed83aaa82",
"sha256:e35a298691b9e10e5a5631f8f0ba605b30ebe19208dc8f58b670462f53753641",
"sha256:e4d020ecf3740b7312bacab2cb966bb720fd4d3490562d373b4ad91dd1857c0d",
"sha256:e564d5a771b4015f34166a05ea2165b7e283635c41b1347696117f780084b46d",
"sha256:ea3f2e9eb41f973f73619e88bf7bd950b16b4c2ce73d15f24a11800ce1eaf276",
"sha256:eabdbe04ee0a7e760fa6cd9e799d2b020d098c580ba99107d52e1e5e538b1ecb",
"sha256:f17b9df97c5ecdfb56c5e85b3c9df9831246df698f8581c6e111ac664c7c656e",
"sha256:f386def57742aacc3d864169dfce644a8c396f95aa35b41b69df53f558d56dd0",
"sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b",
"sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
"version": "==4.9.0"
},
"markupsafe": {
"hashes": [
@ -553,16 +562,16 @@
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==3.2.0"
},
"outcome": {
"hashes": [
"sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958",
"sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"
"sha256:6f82bd3de45da303cf1f771ecafa1633750a358436a8bb60e06a1ceb745d2672",
"sha256:c4ab89a56575d6d38a05aa16daeaa333109c1f96167aba8901ab18b6b5e0f7f5"
],
"markers": "python_version >= '3.6'",
"version": "==1.1.0"
"markers": "python_version >= '3.7'",
"version": "==1.2.0"
},
"protobuf": {
"hashes": [
@ -676,6 +685,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"pygments": {
"hashes": [
"sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
"sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2.12.0"
},
"pyopenssl": {
"hashes": [
"sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf",
@ -688,7 +705,7 @@
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_full_version >= '3.6.8'",
"markers": "python_version >= '3.1'",
"version": "==3.0.9"
},
"pysocks": {
@ -707,21 +724,156 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"python-dotenv": {
"python-slugify": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
"sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1",
"sha256:7b2c274c308b62f4269a9ba701aa69a797e9bca41aeee5b3a9e79e36b6656927"
],
"index": "pypi",
"version": "==0.20.0"
"version": "==6.1.2"
},
"requests": {
"pytz": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"version": "==2022.1"
},
"pytz-deprecation-shim": {
"hashes": [
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.27.1"
"version": "==0.1.0.post0"
},
"pyyaml": {
"hashes": [
"sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
"sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b",
"sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57",
"sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b",
"sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4",
"sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07",
"sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba",
"sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9",
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287",
"sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513",
"sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0",
"sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0",
"sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92",
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f",
"sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2",
"sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc",
"sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c",
"sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86",
"sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4",
"sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c",
"sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34",
"sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b",
"sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c",
"sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb",
"sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737",
"sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3",
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d",
"sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53",
"sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78",
"sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803",
"sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a",
"sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174",
"sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"
],
"index": "pypi",
"version": "==6.0"
},
"regex": {
"hashes": [
"sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14",
"sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9",
"sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204",
"sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f",
"sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737",
"sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b",
"sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3",
"sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4",
"sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac",
"sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f",
"sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29",
"sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772",
"sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1",
"sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863",
"sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66",
"sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed",
"sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47",
"sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f",
"sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f",
"sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008",
"sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d",
"sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571",
"sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0",
"sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a",
"sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3",
"sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7",
"sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447",
"sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493",
"sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4",
"sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede",
"sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640",
"sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd",
"sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c",
"sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee",
"sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30",
"sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b",
"sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec",
"sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1",
"sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e",
"sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8",
"sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9",
"sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231",
"sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7",
"sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729",
"sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960",
"sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056",
"sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357",
"sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7",
"sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3",
"sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7",
"sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573",
"sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0",
"sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178",
"sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f",
"sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834",
"sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c",
"sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015",
"sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0",
"sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57",
"sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635",
"sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07",
"sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2",
"sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1",
"sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b",
"sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2",
"sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5",
"sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b",
"sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86",
"sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5",
"sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93",
"sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0",
"sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f",
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.3.2"
},
"requests": {
"extras": [],
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
],
"markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.0"
},
"requests-oauthlib": {
"hashes": [
@ -738,28 +890,36 @@
],
"version": "==0.9.1"
},
"rich": {
"hashes": [
"sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2",
"sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec"
],
"markers": "python_version < '4' and python_full_version >= '3.6.3'",
"version": "==12.4.4"
},
"rsa": {
"hashes": [
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"markers": "python_version < '4' and python_full_version >= '3.6.0'",
"version": "==4.8"
},
"s3transfer": {
"hashes": [
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
"sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
"sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
"sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.2"
"markers": "python_version >= '3.7'",
"version": "==0.6.0"
},
"selenium": {
"hashes": [
"sha256:866b6dd6c459210662bff922ee7c33162d21920fbf6811e8e5a52be3866a687f"
"sha256:ba5b2633f43cf6fe9d308fa4a6996e00a101ab9cb1aad6fd91ae1f3dbe57f56f"
],
"index": "pypi",
"version": "==4.1.5"
"version": "==4.2.0"
},
"six": {
"hashes": [
@ -797,7 +957,7 @@
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2.3.2.post1"
},
"telethon": {
@ -808,17 +968,35 @@
"index": "pypi",
"version": "==1.24.0"
},
"text-unidecode": {
"hashes": [
"sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
"sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
],
"version": "==1.3"
},
"tiktok-downloader": {
"git": "https://github.com/msramalho/tiktok-downloader",
"ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c"
"hashes": [
"sha256:48fe204df962893a60360a20b13da133bc22bdbfec87c3cd3a9157f138785242"
],
"index": "pypi",
"version": "==0.3.3"
},
"tqdm": {
"hashes": [
"sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d",
"sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==4.64.0"
},
"trio": {
"hashes": [
"sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070",
"sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"
"sha256:4dc0bf9d5cc78767fc4516325b6d80cc0968705a31d0eec2ecd7cdda466265b0",
"sha256:523f39b7b69eef73501cebfe1aafd400a9aad5b03543a0eded52952488ff1c13"
],
"markers": "python_version >= '3.7'",
"version": "==0.20.0"
"version": "==0.21.0"
},
"trio-websocket": {
"hashes": [
@ -828,15 +1006,35 @@
"markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"tzdata": {
"hashes": [
"sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9",
"sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.1"
},
"tzlocal": {
"hashes": [
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==4.2"
},
"uritemplate": {
"hashes": [
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==4.1.1"
},
"urllib3": {
"extras": [
"secure",
"socks"
],
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@ -844,6 +1042,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"vk-api": {
"hashes": [
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
],
"index": "pypi",
"version": "==11.9.8"
},
"websockets": {
"hashes": [
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",

151
README.md
View file

@ -1,50 +1,116 @@
# auto-archiver
This Python script looks for links to Youtube, Twitter, etc. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
Python script to automatically archive social media posts, videos, and images from a Google Sheets document. It uses different archivers depending on the platform, and can save content to local storage, an S3-compatible bucket (Digital Ocean Spaces, AWS, ...), or Google Drive. The Google Sheet the links come from is updated with information about the archived content. It can be run manually or on an automated basis.
## Setup
If you are using `pipenv` (recommended), `pipenv install` is sufficient to install Python prerequisites.
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
You also need:
1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
2. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
### Configuration file
Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result of running `python auto_archive.py --help` (a minimal example configuration is sketched after it):
[firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
<details><summary><code>python auto_archive.py --help</code></summary>
[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
DO_SPACES_REGION=
DO_BUCKET=
DO_SPACES_KEY=
DO_SPACES_SECRET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=
FACEBOOK_COOKIE=
GD_ROOT_FOLDER_ID=
```js
usage: auto_archive.py [-h] [--config CONFIG] [--storage {s3,local,gd}] [--sheet SHEET] [--header HEADER] [--check-if-exists] [--save-logs] [--s3-private] [--col-url URL] [--col-status STATUS] [--col-folder FOLDER]
[--col-archive ARCHIVE] [--col-date DATE] [--col-thumbnail THUMBNAIL] [--col-thumbnail_index THUMBNAIL_INDEX] [--col-timestamp TIMESTAMP] [--col-title TITLE] [--col-duration DURATION]
[--col-screenshot SCREENSHOT] [--col-hash HASH]
Automatically archive social media posts, videos, and images from a Google Sheets document.
The command line arguments will always override the configurations in the provided YAML config file (--config); only some high-level options
are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" columns for the archiver to work.
optional arguments:
-h, --help show this help message and exit
--config CONFIG the filename of the YAML configuration file (defaults to 'config.yaml')
--storage {s3,local,gd}
which storage to use [execution.storage in config.yaml]
--sheet SHEET the name of the google sheets document [execution.sheet in config.yaml]
--header HEADER 1-based index for the header row [execution.header in config.yaml]
--check-if-exists when possible checks if the URL has been archived before and does not archive the same URL twice [execution.check_if_exists]
--save-logs creates or appends execution logs to files logs/LEVEL.log [execution.save_logs]
--s3-private Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]
--col-url URL the name of the column to READ url FROM (default='link')
--col-status STATUS the name of the column to FILL WITH status (default='archive status')
--col-folder FOLDER the name of the column to READ folder FROM (default='destination folder')
--col-archive ARCHIVE
the name of the column to FILL WITH archive (default='archive location')
--col-date DATE the name of the column to FILL WITH date (default='archive date')
--col-thumbnail THUMBNAIL
the name of the column to FILL WITH thumbnail (default='thumbnail')
--col-thumbnail_index THUMBNAIL_INDEX
the name of the column to FILL WITH thumbnail_index (default='thumbnail index')
--col-timestamp TIMESTAMP
the name of the column to FILL WITH timestamp (default='upload timestamp')
--col-title TITLE the name of the column to FILL WITH title (default='upload title')
--col-duration DURATION
the name of the column to FILL WITH duration (default='duration')
--col-screenshot SCREENSHOT
the name of the column to FILL WITH screenshot (default='screenshot')
--col-hash HASH the name of the column to FILL WITH hash (default='hash')
```
`.example.env` is an example of this file
</details><br/>
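For reference, here is a minimal `config.yaml` sketch. The key names below are only inferred from the `--help` output above and may not be exhaustive; see [example.config.yaml](example.config.yaml) for the authoritative format.

```yaml
# minimal sketch; key names inferred from the --help output above,
# see example.config.yaml for the authoritative and complete format
execution:
  sheet: "name of the google sheets document"
  header: 1              # 1-based index of the header row
  storage: s3            # one of: s3, local, gd
  check_if_exists: true  # skip URLs that were already archived
  save_logs: true        # append execution logs to logs/LEVEL.log
secrets:
  s3:
    private: false       # true stores content without public access
```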
#### Example invocations
All the configurations can be specified in the YAML config file, but sometimes it is useful to override only some of them, such as the sheet being archived. Here are some examples (possibly prefixed with `pipenv run`):
```bash
# all the configurations come from config.yaml
python auto_archive.py
# all the configurations come from config.yaml,
# checks if URL is not archived twice and saves logs to logs/ folder
python auto_archive.py --check-if-exists --save-logs
# all the configurations come from my_config.yaml
python auto_archive.py --config my_config.yaml
# reads the configurations but saves archived content to google drive instead
python auto_archive.py --config my_config.yaml --storage gd
# uses the configurations but for another google docs sheet
# with a header on row 2 and with some different column names
python auto_archive.py --config my_config.yaml --sheet="use it on another sheets doc" --header=2 --col-url="put urls here"
# all the configurations come from config.yaml and specifies that s3 files should be private
python auto_archive.py --s3-private
```
### Extra notes on configuration
#### Google Drive
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file; the folder must be shared with the service account, e.g. `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`, and then you can use `--storage=gd`
#### Telethon (Telegrams API Library)
The first time you run it, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root.
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
## Running
There is just one necessary command line flag, `--sheet name`, which is the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. This sheet must also have specific columns in the first row:
* `Media URL` (required): the location of the media to be archived. This is the only column that should be supplied with data initially
The `--sheet name` property (or `execution.sheet` in the YAML file) is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)); only the `link` and `status` columns are mandatory:
* `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially
* `Archive status` (required): the status of the auto archiver script. Any row with text in this column will be skipped automatically.
* `Archive location` (required): the location of the archived version. For files that were not able to be auto archived, this can be manually updated.
* `Destination folder`: (optional) by default files are saved to a folder called `name-of-sheets-document/name-of-sheets-tab/`; with this option you can organize archived content into subfolders from the sheet.
* `Archive location`: the location of the archived version. For files that were not able to be auto archived, this can be manually updated.
* `Archive date`: the date that the auto archiver script ran for this file
* `Upload timestamp`: the timestamp extracted from the video. (For YouTube, this unfortunately does not currently include the time)
* `Duration`: the duration of the video
* `Upload title`: the "title" of the video from the original source
* `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible)
* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content
* `Hash`: a hash of the first video or image found
* `Screenshot`: a screenshot taken from a browser view of the page
* in case of videos
* `Duration`: duration in seconds
* `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible)
* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content
For example, for use with this spreadsheet:
@ -74,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t
# auto_auto_archiver
To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. It must be shared with the same service account.
To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archive.py`, but use `--sheet` to specify the sheet that lists the names of the sheets to archive. It must be shared with the same service account.
![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
@ -86,37 +152,24 @@ Code is split into functional concepts:
1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
### Current Archivers
Archivers are tried in a meaningful order, with the Wayback Machine as the failsafe; this order can easily be changed in the code.
```mermaid
graph TD
A(Archiver) -->|parent of| B(TelegramArchiver)
A -->|parent of| C(TikTokArchiver)
A(Archiver) -->|parent of| B(TelethonArchiver)
A -->|parent of| C(TiktokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
A -->|parent of| E(WaybackArchiver)
A -->|parent of| E(TelegramArchiver)
A -->|parent of| F(TwitterArchiver)
A -->|parent of| G(VkArchiver)
A -->|parent of| H(WaybackArchiver)
```
### Current Storages
```mermaid
graph TD
A(BaseStorage) -->|parent of| B(S3Storage)
A(BaseStorage) -->|parent of| C(GoogleDriveStorage)
A(BaseStorage) -->|parent of| C(LocalStorage)
A(BaseStorage) -->|parent of| D(GoogleDriveStorage)
```
## Saving into Subfolders
You can have a column in the spreadsheet, named via the `--col-subfolder` argument, whose value is passed to the storage and specifies a subfolder to put the archived link into.
## Google Drive
To use Google Drive storage you need the id of the shared folder in the `.env` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
```bash
python auto_archive.py --sheet 'Sheet Name' --storage='gd'
```
## Telethon (Telegrams API Library)
Put your `anon.session` in the root, so that it doesn't stall and ask for authentication

View file

@ -1,8 +1,9 @@
# we need to explicitly expose the available imports here
from .base_archiver import *
from .telegram_archiver import *
from .telethon_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *
from .twitter_archiver import *
from .base_archiver import Archiver, ArchiveResult
from .telegram_archiver import TelegramArchiver
from .telethon_archiver import TelethonArchiver
from .tiktok_archiver import TiktokArchiver
from .wayback_archiver import WaybackArchiver
from .youtubedl_archiver import YoutubeDLArchiver
from .twitter_archiver import TwitterArchiver
from .vk_archiver import VkArchiver

View file

@ -1,23 +1,18 @@
import os
import ffmpeg
import datetime
import shutil
import os, datetime, shutil, hashlib, time, requests, re, mimetypes
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
import hashlib
import time
import requests
from random import randrange
import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify
from storages import Storage
from utils import mkdir_if_not_exists
from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException
@dataclass
class ArchiveResult:
@ -34,6 +29,7 @@ class ArchiveResult:
class Archiver(ABC):
name = "default"
retry_regex = r"retrying at (\d+)$"
def __init__(self, storage: Storage, driver):
self.storage = storage
@ -42,30 +38,39 @@ class Archiver(ABC):
def __str__(self):
return self.__class__.__name__
def __repr__(self):
return self.__str__()
@abstractmethod
def download(self, url, check_if_exists=False): pass
def get_netloc(self, url):
return urlparse(url).netloc
def get_html_key(self, url):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
"""
Generates an index.html page where each @urls_info is displayed
"""
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''
for url_info in urls_info:
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
mime_global = self._guess_file_type(url_info["key"])
preview = ""
if mime_global == "image":
preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
elif mime_global == "video":
preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
page_key = self.get_html_key(url)
page_filename = os.path.join(Storage.TMP_FOLDER, page_key)
with open(page_filename, "w") as f:
f.write(page)
@ -78,8 +83,23 @@ class Archiver(ABC):
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
def _guess_file_type(self, path: str):
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
"""
mime = mimetypes.guess_type(path)[0]
if mime is not None:
return mime.split("/")[0]
return ""
# eg images in a tweet save to cloud storage
def generate_media_page(self, urls, url, object):
"""
For a list of media urls, fetch them, upload them
and call self.generate_media_page_html with them
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
@ -87,26 +107,16 @@ class Archiver(ABC):
thumbnail = None
uploaded_media = []
for media_url in urls:
path = urlparse(media_url).path
key = self.get_key(path.replace("/", "_"))
if '.' not in path:
key += '.jpg'
key = self._get_key_from_url(media_url, ".jpg")
filename = 'tmp/' + key
filename = os.path.join(Storage.TMP_FOLDER, key)
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
self.storage.upload(filename, key)
hash = self.get_hash(filename)
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
@ -130,21 +140,36 @@ class Archiver(ABC):
return f'{self.name}_{_id}{extension}'
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
def get_html_key(self, url):
return self._get_key_from_url(url, ".html")
# TODO: customizable hash
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
f.close()
def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
"""
Receives a URL and returns a slugified version of the URL path
if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
if @append_date is true, the key adds a timestamp after the URL slug and before the extension
"""
slug = slugify(urlparse(url).path)
if append_datetime:
slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
if with_extension is not None:
if "." not in slug:
slug += with_extension
return self.get_key(slug)
def get_hash(self, filename):
with open(filename, "rb") as f:
bytes = f.read() # read entire file as bytes
# TODO: customizable hash
hash = hashlib.sha256(bytes)
# option to use SHA3_512 instead
# hash = hashlib.sha3_512(bytes)
return hash.hexdigest()
def get_screenshot(self, url):
key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key
logger.debug(f"getting screenshot for {url=}")
key = self._get_key_from_url(url, ".png", append_datetime=True)
filename = os.path.join(Storage.TMP_FOLDER, key)
# Accept cookies popup dismiss for ytdlp video
if 'facebook.com' in url:
@ -154,7 +179,7 @@ class Archiver(ABC):
foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
foo.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except:
logger.warning(f'Failed on fb accept cookies for url {url}')
@ -200,12 +225,11 @@ class Archiver(ABC):
key = key_folder + fname
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:
return ('None', 'None')
return ('', '')
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
@ -230,3 +254,37 @@ class Archiver(ABC):
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
"""
sets state to retry in random between (min_seconds, max_seconds)
"""
now = datetime.datetime.now().timestamp()
retry_at = int(now + randrange(min_seconds, max_seconds))
logger.debug(f"signaling {retry_at=}")
return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)
def is_retry(status):
return re.search(Archiver.retry_regex, status) is not None
def should_retry_from_status(status):
"""
checks status against message in signal_retry_in
returns true if enough time has elapsed, false otherwise
"""
match = re.search(Archiver.retry_regex, status)
if match:
retry_at = int(match.group(1))
now = datetime.datetime.now().timestamp()
should_retry = now >= retry_at
logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
return should_retry
return False
def remove_retry(status):
"""
transforms the status from retry into something else
"""
new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
logger.debug(f"removing retry message at {status=}, got {new_status=}")
return new_status
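To illustrate how these retry helpers are meant to fit together, here is a minimal standalone sketch (not part of the diff itself) of how the retry deadline encoded in the status string round-trips; it mirrors `retry_regex`, `signal_retry_in`, `should_retry_from_status` and `remove_retry` above:

```python
import re, time
from random import randrange

RETRY_REGEX = r"retrying at (\d+)$"  # same pattern as Archiver.retry_regex

# a rate-limited archiver records a retry deadline in the status column,
# as signal_retry_in does: "retrying at <unix timestamp>"
status = f"retrying at {int(time.time() + randrange(1800, 7200))}"

# on the next run, should_retry_from_status-style logic checks whether the deadline has passed
match = re.search(RETRY_REGEX, status)
if match and time.time() >= int(match.group(1)):
    print("deadline passed: archive this row again")
else:
    print("still waiting: skip this row for now")

# remove_retry rewrites the status so a row is not retried forever
print(re.sub(RETRY_REGEX, "failed: too many retries", status))
```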

View file

@ -1,11 +1,11 @@
import os
import requests
import os, requests, re
import html
from bs4 import BeautifulSoup
from loguru import logger
import re
import html
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TelegramArchiver(Archiver):
@ -52,8 +52,7 @@ class TelegramArchiver(Archiver):
video_id = video_url.split('/')[-1].split('?')[0]
key = self.get_key(video_id)
filename = 'tmp/' + key
cdn_url = self.storage.get_cdn_url(key)
filename = os.path.join(Storage.TMP_FOLDER, key)
if check_if_exists and self.storage.exists(key):
status = 'already archived'
@ -84,5 +83,6 @@ class TelegramArchiver(Archiver):
filename, key, duration=duration)
os.remove(filename)
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)

View file

@ -1,28 +1,25 @@
import os
import re
import html
from dataclasses import dataclass
from loguru import logger
import os, re
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
import html
from loguru import logger
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
@dataclass
class TelegramConfig:
api_id: str
api_hash: str
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import TelethonConfig
from utils import getattr_or
class TelethonArchiver(Archiver):
name = "telethon"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)")
def __init__(self, storage: Storage, driver, config: TelegramConfig):
def __init__(self, storage: Storage, driver, config: TelethonConfig):
super().__init__(storage, driver)
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
if config:
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
"""
@ -31,8 +28,8 @@ class TelethonArchiver(Archiver):
of `max_amp` both ways
Returns a list of [post] where each post has media and is in the same grouped_id
"""
if original_post.grouped_id is None:
return [original_post] if original_post.media is not None else []
if getattr_or(original_post, "grouped_id") is None:
return [original_post] if getattr_or(original_post, "media") else []
search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
posts = self.client.get_messages(chat, ids=search_ids)
@ -43,16 +40,19 @@ class TelethonArchiver(Archiver):
return media
def download(self, url, check_if_exists=False):
if not hasattr(self, "client"):
logger.error('Missing Telethon config')
return False
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
if not len(matches):
return False
status = "success"
screenshot = self.get_screenshot(url)
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
with self.client.start():
with self.client.start(bot_token=self.bot_token):
matches = list(matches[0])
chat, post_id = matches[1], matches[2]
@ -61,16 +61,20 @@ class TelethonArchiver(Archiver):
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False
except ChannelInvalidError as e:
# TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
logger.error(f'Could not fetch telegram {url} possibly it\'s private or not displayable in : {e}')
logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
return False
media_posts = self._get_media_posts_in_group(chat, post)
if post is None: return False
if len(media_posts) > 1:
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
screenshot = self.get_screenshot(url)
if len(media_posts) > 0:
key = self.get_html_key(url)
if check_if_exists and self.storage.exists(key):
@ -82,30 +86,22 @@ class TelethonArchiver(Archiver):
group_id = post.grouped_id if post.grouped_id is not None else post.id
uploaded_media = []
message = post.message
for mp in media_posts:
for i, mp in enumerate(media_posts):
if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1]
filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)
key = filename.split(Storage.TMP_FOLDER)[1]
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
if i == 0:
key_thumb, thumb_index = self.get_thumbnails(filename, key)
os.remove(filename)
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'tmp/{key}')
key = filename.split('tmp/')[1].replace(" ", "")
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
key_thumb, thumb_index = self.get_thumbnails(filename, key)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
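A minimal sketch of the bot-token flow the archiver now depends on (the credentials are placeholders); with a bot_token, start() authenticates without prompting for a phone number and auth code when anon.session is missing:

from telethon.sync import TelegramClient

client = TelegramClient("./anon", api_id=12345, api_hash="0123456789abcdef")  # placeholder credentials
with client.start(bot_token="123456:ABC-placeholder"):
    post = client.get_messages("some_channel", ids=42)    # same call the archiver makes per post id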

Wyświetl plik

@ -3,6 +3,7 @@ import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
from storages import Storage
class TiktokArchiver(Archiver):
@ -17,8 +18,8 @@ class TiktokArchiver(Archiver):
try:
info = tiktok_downloader.info_post(url)
key = self.get_key(f'{info.id}.mp4')
cdn_url = self.storage.get_cdn_url(key)
filename = 'tmp/' + key
filename = os.path.join(Storage.TMP_FOLDER, key)
logger.info(f'found video {key=}')
if check_if_exists and self.storage.exists(key):
status = 'already archived'
@ -27,13 +28,15 @@ class TiktokArchiver(Archiver):
if len(media) <= 0:
if status == 'already archived':
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
else:
return ArchiveResult(status='Could not download media')
logger.info(f'downloading video {key=}')
media[0].download(filename)
if status != 'already archived':
logger.info(f'uploading video {key=}')
self.storage.upload(filename, key)
try:
@ -49,10 +52,12 @@ class TiktokArchiver(Archiver):
try: os.remove(filename)
except FileNotFoundError:
logger.info(f'tmp file not found thus not deleted {filename}')
cdn_url = self.storage.get_cdn_url(key)
timestamp = info.create.isoformat() if hasattr(info, "create") else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
timestamp=timestamp, hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'

Wyświetl plik

@ -1,6 +1,8 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import html
from urllib.parse import urlparse
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from .base_archiver import Archiver, ArchiveResult
@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url):
logger.debug(f'{url=} is not from twitter')
return False
tweet_id = urlparse(url).path.split('/')
@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
i = tweet_id.index('status')
tweet_id = tweet_id[i + 1]
else:
logger.debug(f'{url=} does not contain "status"')
return False
scr = TwitterTweetScraper(tweet_id)
@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
return False
if tweet.media is None:
logger.trace(f'No media found')
return False
logger.debug(f'No media found, archiving tweet text only')
screenshot = self.get_screenshot(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
urls = []
@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)

Wyświetl plik

@ -0,0 +1,89 @@
import re, json, requests
import vk_api, dateparser
from bs4 import BeautifulSoup
from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig
class VkArchiver(Archiver):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
name = "vk"
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
onclick_pattern = re.compile(r"({.*})")
def __init__(self, storage: Storage, driver, config: VkConfig):
super().__init__(storage, driver)
if config is not None:
self.vk_session = vk_api.VkApi(config.username, config.password)
self.vk_session.auth(token_only=True)
def download(self, url, check_if_exists=False):
# detect URLs that this archiver can handle
_id, method = None, None
if has_wall := self.wall_pattern.search(url):
_id = has_wall[0]
method = self.archive_wall
elif has_photo := self.photo_pattern.search(url):
_id = has_photo[0]
method = self.archive_photo
else: return False
logger.info(f"found valid {_id=} from {url=}")
proper_url = f'https://vk.com/{_id}'
# if check_if_exists is set, do not download the same post again
key = self.get_html_key(proper_url)
if check_if_exists and self.storage.exists(key):
screenshot = self.get_screenshot(proper_url)
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
try:
return method(proper_url, _id)
except Exception as e:
logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}")
return False
def archive_photo(self, photo_url, photo_id):
headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version}
req = requests.get("https://api.vk.com/method/photos.getById", headers)
res = req.json()["response"][0]
title = res["text"][:200] # more on the page
img_url = res["orig_photo"]["url"]
time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})
page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res)
screenshot = self.get_screenshot(photo_url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
def archive_wall(self, wall_url, wall_id):
headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version}
req = requests.get("https://api.vk.com/method/wall.getById", headers)
res = req.json()["response"]
wall = res["items"][0]
img_urls = []
if "attachments" in wall:
for a in wall["attachments"]:
attachment = a[a["type"]]
if "thumb" in attachment:
attachment = attachment["thumb"]
if "sizes" in attachment:
try: img_urls.append(attachment["sizes"][-1]["url"])
except Exception as e:
logger.warning(f"could not get image from attachment: {e}")
title = wall["text"][:200] # more on the page
time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})
page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res)
screenshot = self.get_screenshot(wall_url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)

Wyświetl plik

@ -1,81 +1,88 @@
import time, requests, os
import time, requests
from loguru import logger
from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from loguru import logger
from configs import WaybackConfig
class WaybackArchiver(Archiver):
"""
This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}"
but that might not be desirable since the webpage might have been archived a long time ago and thus have changed
"""
name = "wayback"
def __init__(self, storage: Storage, driver):
def __init__(self, storage: Storage, driver, config: WaybackConfig):
super(WaybackArchiver, self).__init__(storage, driver)
self.config = config
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if check_if_exists and url in self.seen_urls:
return self.seen_urls[url]
if self.config is None:
logger.error('Missing Wayback config')
return False
if check_if_exists:
if url in self.seen_urls: return self.seen_urls[url]
screenshot = self.get_screenshot(url)
logger.debug(f"POSTing {url=} to web.archive.org")
ia_headers = {
"Accept": "application/json",
"Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
"Authorization": f"LOW {self.config.key}:{self.config.secret}"
}
r = requests.post(
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
return ArchiveResult(status="Internet archive failed")
return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
if 'job_id' not in r.json() and 'message' in r.json():
logger.warning(f"Internet archive failed json \n {r.json()}")
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
return self.custom_retry(r.json(), screenshot=screenshot)
job_id = r.json()['job_id']
status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
logger.debug(f"GETting status for {job_id=} on {url=}")
status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
retries = 0
# TODO: make the job queue parallel -> consider propagation of results back to sheet though
# wait 90-120 seconds for the archive job to finish
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
time.sleep(3)
try:
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
except:
time.sleep(1)
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
return self.custom_retry(status_json, screenshot=screenshot)
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(r.content, 'html.parser')
req = requests.get(archive_url)
parsed = BeautifulSoup(req.content, 'html.parser')
title = parsed.find_all('title')[0].text
if title == 'Wayback Machine':
title = 'Could not get title'
except:
title = "Could not get title"
screenshot = self.get_screenshot(url)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result
return result
self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
return self.seen_urls[url]
def custom_retry(self, json_data, **kwargs):
logger.warning(f"Internet archive failed json \n {json_data}")
if "please try again" in str(json_data).lower():
return self.signal_retry_in(**kwargs)
if "this host has been already captured" in str(json_data).lower():
return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later
return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)

Wyświetl plik

@ -1,6 +1,6 @@
import os
import datetime
import os, datetime
import yt_dlp
from loguru import logger
@ -10,7 +10,7 @@ from storages import Storage
class YoutubeDLArchiver(Archiver):
name = "youtube_dl"
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
def __init__(self, storage: Storage, driver, fb_cookie):
super().__init__(storage, driver)
@ -18,7 +18,7 @@ class YoutubeDLArchiver(Archiver):
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'www.facebook.com']:
if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.fb_cookie
@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver):
os.remove(filename)
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
if 'timestamp' in info else \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
if 'upload_date' in info and info['upload_date'] is not None else \
None
timestamp = None
if 'timestamp' in info and info['timestamp'] is not None:
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
elif 'upload_date' in info and info['upload_date'] is not None:
timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
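The refactored fallback, written as a hypothetical helper to show the order of preference (the info dicts are illustrative yt-dlp extractions):

import datetime

def extract_timestamp(info: dict):
    # prefer the exact epoch 'timestamp', fall back to the day-granular 'upload_date'
    if info.get('timestamp') is not None:
        return datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
    if info.get('upload_date') is not None:
        return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
    return None

extract_timestamp({'timestamp': 1655300000})    # '2022-06-15T13:33:20+00:00'
extract_timestamp({'upload_date': '20220615'})  # datetime.datetime(2022, 6, 15, 0, 0, tzinfo=datetime.timezone.utc)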

Wyświetl plik

@ -1,30 +1,17 @@
import os
import datetime
import argparse
import requests
import shutil
import gspread
import os, datetime, shutil, traceback, random
from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import traceback
from slugify import slugify
import archivers
from storages import S3Storage, S3Config
from storages.gd_storage import GDConfig, GDStorage
from utils import GWorksheet, mkdir_if_not_exists
import sys
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
from storages import Storage
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")
load_dotenv()
random.seed()
def update_sheet(gw, row, result: archivers.ArchiveResult):
def update_sheet(gw, row, result: ArchiveResult):
cell_updates = []
row_values = gw.get_row(row)
@ -37,8 +24,7 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('thumbnail', result.thumbnail,
f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration))
@ -58,169 +44,107 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
gw.batch_set_cell(cell_updates)
def expand_url(url):
# expand short URL links
if 'https://t.co/' in url:
try:
r = requests.get(url)
url = r.url
except:
logger.error(f'Failed to expand url {url}')
return url
def missing_required_columns(gw: GWorksheet):
missing = False
for required_col in ['url', 'status']:
if not gw.col_exists(required_col):
logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
missing = True
return missing
def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
gd_config = GDConfig(
root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
gw = GWorksheet(wks, header_row=header, columns=columns)
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
if not gw.col_exists('url'):
logger.info(
f'No "{columns["url"]}" column found, skipping worksheet {wks.title}')
continue
if missing_required_columns(gw): continue
if not gw.col_exists('status'):
logger.info(
f'No "{columns["status"]}" column found, skipping worksheet {wks.title}')
continue
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config)
gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
gd_client = GDStorage(gd_config)
# archives will default to being in a folder 'doc_name/worksheet_name'
default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
c.set_folder(default_folder)
storage = c.get_storage()
# loop through rows in worksheet
for row in range(1 + header, gw.count_rows() + 1):
for row in range(1 + c.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
if url != '' and status in ['', None]:
is_retry = False
if url == '' or status not in ['', None]:
is_retry = Archiver.should_retry_from_status(status)
if not is_retry: continue
# All checks done - archival process starts here
try:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
subfolder = gw.get_cell_or_default(row, 'subfolder')
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
# make a new driver so each spreadsheet row is idempotent
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
# in seconds, to catch telegram screenshots which don't come back
driver.set_page_load_timeout(120)
# client
storage_client = None
if storage == "s3":
storage_client = s3_client
elif storage == "gd":
storage_client = gd_client
else:
raise ValueError(f'Cant get storage_client {storage_client}')
storage_client.update_properties(subfolder=subfolder)
c.recreate_webdriver()
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelethonArchiver(storage_client, driver, telegram_config),
archivers.TelegramArchiver(storage_client, driver),
archivers.TiktokArchiver(storage_client, driver),
archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
archivers.TwitterArchiver(storage_client, driver),
archivers.WaybackArchiver(storage_client, driver)
TelethonArchiver(storage, c.webdriver, c.telegram_config),
TiktokArchiver(storage, c.webdriver),
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
logger.debug(f'Trying {archiver} on {row=}')
try:
result = archiver.download(url, check_if_exists=True)
except KeyboardInterrupt:
logger.warning("caught interrupt")
gw.set_cell(row, 'status', '')
driver.quit()
exit()
result = archiver.download(url, check_if_exists=c.check_if_exists)
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
if result:
# IA is a Success I believe - or do we want to display a logger warning for it?
if result.status in ['success', 'already archived', 'Internet Archive fallback']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}, url {url}')
success = result.status in ['success', 'already archived']
result.status = f"{archiver.name}: {result.status}"
if success:
logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
break
# only 1 retry possible for now
if is_retry and Archiver.is_retry(result.status):
result.status = Archiver.remove_retry(result.status)
logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
# wayback has seen this url before so keep existing status
if "wayback: Internet Archive fallback" in result.status:
logger.success(
f'wayback has seen this url before so keep existing status on row {row}')
result.status = result.status.replace(' (duplicate)', '')
result.status = str(result.status) + " (duplicate)"
break
logger.warning(
f'{archiver} did not succeed on {row=}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
# get rid of driver so can reload on next row
driver.quit()
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
logger.success(f'Finshed worksheet {wks.title}')
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {row=}, {url=}")
gw.set_cell(row, 'status', '')
c.destroy_webdriver()
exit()
except Exception as e:
logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
logger.success(f'Finished worksheet {wks.title}')
@logger.catch
def main():
logger.debug(f'Passed args:{sys.argv}')
parser = argparse.ArgumentParser(
description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
parser.add_argument('--private', action='store_true', help='Store content without public access permission')
parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to fill with {k} (defaults={v})"
if k == "subfolder":
help = f"the name of the column to read the {k} from (defaults={v})"
parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)
args = parser.parse_args()
config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}
logger.info(f'Opening document {args.sheet} for header {args.header} and storage {args.storage}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet, args.storage, args.header, config_columns)
shutil.rmtree('tmp')
c = Config()
c.parse()
logger.info(f'Opening document {c.sheet} for header {c.header}')
mkdir_if_not_exists(Storage.TMP_FOLDER)
process_sheet(c)
c.destroy_webdriver()
shutil.rmtree(Storage.TMP_FOLDER)
if __name__ == '__main__':

Wyświetl plik

@ -1,29 +1,30 @@
import gspread
import argparse
import shutil
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
from utils import mkdir_if_not_exists
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
c = Config()
c.parse()
logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
args = parser.parse_args()
logger.info("Opening document " + args.sheet)
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(args.sheet)
gc = c.gsheets_client
sh = gc.open(c.sheet)
wks = sh.get_worksheet(0)
values = wks.get_all_values()
mkdir_if_not_exists(Storage.TMP_FOLDER)
for i in range(11, len(values)):
sheet_name = values[i][0]
c.sheet = values[i][0]
logger.info(f"Processing {c.sheet}")
auto_archive.process_sheet(c)
c.destroy_webdriver()
shutil.rmtree(Storage.TMP_FOLDER)
logger.info("Processing " + sheet_name)
auto_archive.process_sheet(sheet_name)
if __name__ == "__main__":
main()

Wyświetl plik

@ -0,0 +1,5 @@
from .config import Config
from .selenium_config import SeleniumConfig
from .telethon_config import TelethonConfig
from .wayback_config import WaybackConfig
from .vk_config import VkConfig

252
configs/config.py 100644
Wyświetl plik

@ -0,0 +1,252 @@
import argparse, yaml, json
import gspread
from loguru import logger
from selenium import webdriver
from dataclasses import asdict
from selenium.common.exceptions import TimeoutException
from utils import GWorksheet, getattr_or
from .wayback_config import WaybackConfig
from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
class Config:
"""
Controls the current execution parameters and manages API configurations
Usage:
c = Config() # initializes the argument parser
c.parse() # parses the values and initializes the Services and API clients
# you can then access the Services and APIs like 'c.s3_config'
All the configurations available as cmd line options, when included, will
override the configurations in the config.yaml file.
Configurations are split between:
1. "secrets" containing API keys for generating services - not kept in memory
2. "execution" containing specific execution configurations
"""
AVAILABLE_STORAGES = {"s3", "gd", "local"}
def __init__(self):
self.parser = self.get_argument_parser()
self.folder = ""
def parse(self):
self.args = self.parser.parse_args()
logger.success(f'Command line arguments parsed successfully')
self.config_file = self.args.config
self.read_config_yaml()
logger.info(f'APIs and Services initialized:\n{self}')
def read_config_yaml(self):
with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = yaml.safe_load(inf)
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {})
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER)
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
if self.save_logs:
self.set_log_files()
self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)
# Column names come from config and can be overwritten by CMD
# in the end all are considered as lower case
config_column_names = execution.get("column_names", {})
self.column_names = {}
for k in GWorksheet.COLUMN_NAMES.keys():
self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()
# selenium driver
selenium_configs = execution.get("selenium", {})
self.selenium_config = SeleniumConfig(
timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
)
self.webdriver = "not initialized"
# ---------------------- SECRETS - APIs and service configurations
secrets = self.config.get("secrets", {})
# assert selected storage credentials exist
for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"
# google sheets config
self.gsheets_client = gspread.service_account(
filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
)
# facebook config
self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)
# s3 config
if "s3" in secrets:
s3 = secrets["s3"]
self.s3_config = S3Config(
bucket=s3["bucket"],
region=s3["region"],
key=s3["key"],
secret=s3["secret"],
endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
cdn_url=s3.get("cdn_url", S3Config.cdn_url),
key_path=s3.get("key_path", S3Config.key_path),
private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private))
)
# GDrive config
if "google_drive" in secrets:
gd = secrets["google_drive"]
self.gd_config = GDConfig(
root_folder_id=gd.get("root_folder_id"),
service_account=gd.get("service_account", GDConfig.service_account)
)
if "local" in secrets:
self.local_config = LocalConfig(
save_to=secrets["local"].get("save_to", LocalConfig.save_to),
)
# wayback machine config
if "wayback" in secrets:
self.wayback_config = WaybackConfig(
key=secrets["wayback"]["key"],
secret=secrets["wayback"]["secret"],
)
else:
self.wayback_config = None
logger.debug(f"'wayback' key not present in the {self.config_file=}")
# telethon config
if "telegram" in secrets:
self.telegram_config = TelethonConfig(
api_id=secrets["telegram"]["api_id"],
api_hash=secrets["telegram"]["api_hash"],
bot_token=secrets["telegram"].get("bot_token", None)
)
else:
self.telegram_config = None
logger.debug(f"'telegram' key not present in the {self.config_file=}")
# vk config
if "vk" in secrets:
self.vk_config = VkConfig(
username=secrets["vk"]["username"],
password=secrets["vk"]["password"]
)
else:
self.vk_config = None
logger.debug(f"'vk' key not present in the {self.config_file=}")
del self.config["secrets"] # delete to prevent leaks
def set_log_files(self):
# called only when config.execution.save_logs=true
logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")
def get_argument_parser(self):
"""
Creates the CMD line arguments. 'python auto_archive.py --help'
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config); only some high-level options are available via the command line, and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" columns for the archiver to work. ')
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [execution.check_if_exists]')
parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [execution.save_logs]')
parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')
for k, v in GWorksheet.COLUMN_NAMES.items():
help = f"the name of the column to FILL WITH {k} (default='{v}')"
if k in ["url", "folder"]:
help = f"the name of the column to READ {k} FROM (default='{v}')"
parser.add_argument(f'--col-{k}', action='store', dest=k, help=help)
return parser
def set_folder(self, folder):
"""
update the folder in each of the storages
"""
self.folder = folder
# s3
if hasattr(self, "s3_config"): self.s3_config.folder = folder
if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
# gdrive
if hasattr(self, "gd_config"): self.gd_config.folder = folder
if hasattr(self, "gd_storage"): self.gd_storage.folder = folder
# local
if hasattr(self, "local_config"): self.local_config.folder = folder
if hasattr(self, "local_storage"): self.local_storage.folder = folder
def get_storage(self):
"""
returns the configured type of storage, creating if needed
"""
if self.storage == "s3":
self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config))
return self.s3_storage
elif self.storage == "gd":
self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config))
return self.gd_storage
elif self.storage == "local":
self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config))
return self.local_storage
raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}"
def destroy_webdriver(self):
if self.webdriver is not None and type(self.webdriver) != str:
self.webdriver.quit()
del self.webdriver
def recreate_webdriver(self):
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
try:
new_webdriver = webdriver.Firefox(options=options)
# only destroy if creation is successful
self.destroy_webdriver()
self.webdriver = new_webdriver
self.webdriver.set_window_size(self.selenium_config.window_width,
self.selenium_config.window_height)
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
def __str__(self) -> str:
return json.dumps({
"config_file": self.config_file,
"sheet": self.sheet,
"storage": self.storage,
"header": self.header,
"check_if_exists": self.check_if_exists,
"save_logs": self.save_logs,
"tmp_folder": Storage.TMP_FOLDER,
"selenium_config": asdict(self.selenium_config),
"selenium_webdriver": self.webdriver != None,
"s3_config": hasattr(self, "s3_config"),
"s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
"gd_config": hasattr(self, "gd_config"),
"local_config": hasattr(self, "local_config"),
"wayback_config": self.wayback_config != None,
"telegram_config": self.telegram_config != None,
"vk_config": self.vk_config != None,
"gsheets_client": self.gsheets_client != None,
"column_names": self.column_names,
}, ensure_ascii=False, indent=4)
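A minimal usage sketch of the new Config-driven entry point (file and sheet names are placeholders):

# python auto_archive.py --config config.yaml --sheet "My Archiving Sheet" --check-if-exists

c = Config()
c.parse()                        # parses CLI args, then reads config.yaml (secrets + execution)
storage = c.get_storage()        # S3Storage / GDStorage / LocalStorage per execution.storage
c.set_folder("my-sheet/tab-1")   # propagated to whichever storage is configured
c.recreate_webdriver()           # headless Firefox used for screenshots
# ... archive rows ...
c.destroy_webdriver()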

Wyświetl plik

@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class SeleniumConfig:
timeout_seconds: int = 120
window_width: int = 1400
window_height: int = 2000

Wyświetl plik

@ -0,0 +1,9 @@
from dataclasses import dataclass
@dataclass
class TelethonConfig:
api_id: str
api_hash: str
bot_token: str

Wyświetl plik

@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class VkConfig:
username: str
password: str

Wyświetl plik

@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class WaybackConfig:
key: str
secret: str

Wyświetl plik

@ -0,0 +1,88 @@
---
secrets:
# needed if you use storage=s3
s3:
# contains S3 info on region, bucket, key and secret
region: reg1
bucket: my-bucket
key: "s3 API key"
secret: "s3 API secret"
# the endpoint URL is formatted with the region, like so:
endpoint_url: 'https://{region}.digitaloceanspaces.com'
# the CDN URL is formatted with bucket, region, and key (the archived file path generated during execution), like so:
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
# if private:true S3 urls will not be readable online
private: false
# with 'random' a random UUID is generated for the URL instead of a predictable path, useful to keep files public but unlisted; the alternative is 'default' (or omitting this option from the config)
key_path: random
# needed if you use storage=gd
google_drive:
# local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json
service_account: "service_account.json"
root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
# needed if you use storage=local
local:
# local path to save files in
save_to: "./local_archive"
wayback:
# to get credentials visit https://archive.org/account/s3.php
key: your API key
secret: your API secret
telegram:
# to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
api_id: your API id
api_hash: your API hash
# optional, but allows access to more content such as large videos, talk to @botfather
bot_token: your bot-token
# vkontakte (vk.com) credentials
vk:
username: "phone number or email"
password: "password"
google_sheets:
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
service_account: "service_account.json"
facebook:
# optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'
cookie: ""
execution:
# can be overwritten with CMD --sheet=
sheet: your-sheet-name
# which row of your tabs contains the header, can be overwritten with CMD --header=
header: 1
# which storage to use, can be overwritten with CMD --storage=
storage: s3
# defaults to false, when true will try to avoid duplicate URL archives
check_if_exists: true
# optional configurations for the selenium browser that takes screenshots, these are the defaults
selenium:
# values under 10s might mean screenshots fail to be captured
timeout_seconds: 120
window_width: 1400
window_height: 2000
# local tmp folder to save files before uploading to storage
tmp_folder: tmp/
# puts execution logs into the logs/ folder, defaults to false
save_logs: true
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
# url and status are the only columns required to be present in the google sheet
column_names:
url: link
status: archive status
archive: archive location
# use this column to override default location data
folder: folder
date: archive date
thumbnail: thumbnail
thumbnail_index: thumbnail index
timestamp: upload timestamp
title: upload title
duration: duration
screenshot: screenshot
hash: hash

Wyświetl plik

@ -1,3 +1,5 @@
# we need to explicitly expose the available imports here
from .base_storage import *
from .s3_storage import *
from .base_storage import Storage
from .local_storage import LocalStorage, LocalConfig
from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage

Wyświetl plik

@ -4,6 +4,8 @@ from pathlib import Path
class Storage(ABC):
TMP_FOLDER = "tmp/"
@abstractmethod
def __init__(self, config): pass
@ -20,25 +22,3 @@ class Storage(ABC):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
def update_properties(self, **kwargs):
"""
method used to update general properties that some children may use
and others not, but that all can call
"""
for k, v in kwargs.items():
if k in self.get_allowed_properties():
setattr(self, k, v)
else:
logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
def get_allowed_properties(self):
"""
child classes should specify which properties they allow to be set
"""
return set(["subfolder"])
def clean_path(self, folder, default="", add_forward_slash=True):
if folder is None or type(folder) != str or len(folder.strip()) == 0:
return default
return str(Path(folder)) + ("/" if add_forward_slash else "")

Wyświetl plik

@ -1,26 +1,26 @@
import os, time
from loguru import logger
from .base_storage import Storage
from dataclasses import dataclass
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.oauth2 import service_account
import time
@dataclass
class GDConfig:
root_folder_id: str
folder: str = "default"
service_account: str = "service_account.json"
class GDStorage(Storage):
DEFAULT_UPLOAD_FOLDER_NAME = "default"
def __init__(self, config: GDConfig):
self.folder = config.folder
self.root_folder_id = config.root_folder_id
SCOPES = ['https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
creds = service_account.Credentials.from_service_account_file(
config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
self.service = build('drive', 'v3', credentials=creds)
def get_cdn_url(self, key):
@ -28,160 +28,111 @@ class GDStorage(Storage):
only support files saved in a folder for GD
S3 supports folder and all stored in the root
"""
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
# retry policy on Google Drive
try_again = True
counter = 1
folder_id = None
while try_again:
# need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
for item in items:
logger.debug(f"found folder of {item['name']}")
folder_id = item['id']
try_again = False
if folder_id is None:
logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
counter += 1
time.sleep(10)
if counter > 18:
raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
# check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html'
# happens doing thumbnails
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'index.html'
logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
# get id of the sub folder
results = self.service.files().list(
q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
filename = None
for item in items:
folder_id = item['id']
filename = b
if filename is None:
raise ValueError(f'Problem finding sub folder {a}')
full_name = os.path.join(self.folder, key)
parent_id, folder_id = self.root_folder_id, None
path_parts = full_name.split(os.path.sep)
filename = path_parts[-1]
logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id
# get id of file inside folder (or sub folder)
results = self.service.files().list(
q=f"'{folder_id}' in parents and name = '{filename}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
file_id = self._get_id_from_parent_and_name(folder_id, filename)
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
file_id = None
for item in items:
logger.debug(f"found file of {item['name']}")
file_id = item['id']
def exists(self, key):
try:
self.get_cdn_url(key)
return True
except: return False
if file_id is None:
raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
return foo
def exists(self, _key):
# TODO: How to check for google drive, as it accepts different names
return False
def uploadf(self, file, key, **_kwargs):
logger.debug(f"before {self.subfolder=}")
self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
filename = key
logger.debug(f"after {self.subfolder=}")
# does folder eg SM0005 exist already inside parent of Files auto-archiver
results = self.service.files().list(
q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
spaces='drive',
fields='files(id, name)'
).execute()
items = results.get('files', [])
folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
folder_id_to_upload_to = item['id']
if folder_id_to_upload_to is None:
logger.debug(f'Creating new folder {self.subfolder}')
file_metadata = {
'name': [self.subfolder],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [self.root_folder_id]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
folder_id_to_upload_to = gd_file.get('id')
# check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails
# will always return a and a blank b even if there is nothing to split
# https://stackoverflow.com/a/38149500/26086
a, _, b = filename.partition('/')
if b != '':
# a: 'youtube_dl_sDE-qZdi8p8'
# b: 'out1.jpg'
logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
# does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
results = self.service.files().list(
q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
sub_folder_id_to_upload_to = None
if len(items) > 1:
logger.error(f'Duplicate folder name of {a} which should never happen')
for item in items:
logger.debug(f"Found existing folder of {item['name']}")
sub_folder_id_to_upload_to = item['id']
if sub_folder_id_to_upload_to is None:
# create new folder
file_metadata = {
'name': [a],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [folder_id_to_upload_to]
}
gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
sub_folder_id_to_upload_to = gd_file.get('id')
filename = b
folder_id_to_upload_to = sub_folder_id_to_upload_to
# back to normal control flow
def uploadf(self, file: str, key: str, **_kwargs):
"""
1. for each sub-folder in the path check if exists or create
2. upload file to root_id/other_paths.../filename
"""
full_name = os.path.join(self.folder, key)
parent_id, upload_to = self.root_folder_id, None
path_parts = full_name.split(os.path.sep)
filename = path_parts[-1]
logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
for folder in path_parts[0:-1]:
upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
if upload_to is None:
upload_to = self._mkdir(folder, parent_id)
parent_id = upload_to
# upload file to gd
logger.debug(f'uploading {filename=} to folder id {upload_to}')
file_metadata = {
'name': [filename],
'parents': [folder_id_to_upload_to]
'parents': [upload_to]
}
media = MediaFileUpload(file, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
def upload(self, filename: str, key: str, **kwargs):
# GD only requires the filename not a file reader
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
self.uploadf(filename, key, **kwargs)
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
"""
Retrieves the id of a folder or file from its @name and the @parent_id folder
Optionally does multiple @retries and sleeps @sleep_seconds between them
If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
If @raise_on_missing will throw error when not found, or returns None
Will remember previous calls to avoid duplication if @use_cache
Returns the id of the file or folder from its name as a string
"""
# cache logic
if use_cache:
self.api_cache = getattr(self, "api_cache", {})
cache_key = f"{parent_id}_{name}_{use_mime_type}"
if cache_key in self.api_cache:
logger.debug(f"cache hit for {cache_key=}")
return self.api_cache[cache_key]
# API logic
debug_header: str = f"[searching {name=} in {parent_id=}]"
query_string = f"'{parent_id}' in parents and name = '{name}' "
if use_mime_type:
query_string += f" and mimeType='application/vnd.google-apps.folder' "
for attempt in range(retries):
results = self.service.files().list(
q=query_string,
spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)'
).execute()
items = results.get('files', [])
if len(items) > 0:
logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
_id = items[-1]['id']
if use_cache: self.api_cache[cache_key] = _id
return _id
else:
logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
if attempt < retries - 1:
logger.debug(f'sleeping for {sleep_seconds} second(s)')
time.sleep(sleep_seconds)
if raise_on_missing:
raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
return None
def _mkdir(self, name: str, parent_id: str):
"""
Creates a new GDrive folder @name inside folder @parent_id
Returns id of the created folder
"""
logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
file_metadata = {
'name': [name],
'mimeType': 'application/vnd.google-apps.folder',
'parents': [parent_id]
}
gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
return gd_folder.get('id')
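A usage sketch of the rewritten GDStorage path walk (ids and names are placeholders):

gd = GDStorage(GDConfig(root_folder_id="0ABCDEFrootFolderId", service_account="service_account.json"))
gd.folder = "my-sheet/tab-1"                       # set per worksheet by Config.set_folder
gd.upload("tmp/abc/index.html", "abc/index.html")  # walks my-sheet -> tab-1 -> abc, creating missing folders, then uploads index.html
gd.get_cdn_url("abc/index.html")                   # 'https://drive.google.com/file/d/<file id>/view?usp=sharing'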

Wyświetl plik

@ -0,0 +1,31 @@
import os
from dataclasses import dataclass
from .base_storage import Storage
from utils import mkdir_if_not_exists
@dataclass
class LocalConfig:
folder: str = ""
save_to: str = "./"
class LocalStorage(Storage):
def __init__(self, config:LocalConfig):
self.folder = config.folder
self.save_to = config.save_to
mkdir_if_not_exists(self.save_to)
def get_cdn_url(self, key):
full_path = os.path.join(self.save_to, self.folder, key)
mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
return os.path.abspath(full_path)
def exists(self, key):
return os.path.isfile(self.get_cdn_url(key))
def uploadf(self, file, key, **kwargs):
path = self.get_cdn_url(key)
with open(path, "wb") as outf:
outf.write(file.read())
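And the equivalent flow for the new LocalStorage (paths are placeholders):

local = LocalStorage(LocalConfig(save_to="./local_archive"))
local.folder = "my-sheet/tab-1"
local.upload("tmp/video.mp4", "video.mp4")   # copies the file to ./local_archive/my-sheet/tab-1/video.mp4
local.get_cdn_url("video.mp4")               # absolute path of the stored file, used as the "archive location"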

Wyświetl plik

@ -1,5 +1,9 @@
import uuid, os
from dataclasses import dataclass
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
from loguru import logger
@ -12,30 +16,47 @@ class S3Config:
key: str
secret: str
folder: str = ""
endpoint_url: str = "https://{region}.digitaloceanspaces.com"
cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
private: bool = False
key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid
class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = self.clean_path(config.folder)
self.folder = config.folder
self.private = config.private
self.cdn_url = config.cdn_url
self.key_path = config.key_path
self.key_dict = {}
self.s3 = boto3.client(
's3',
region_name=self.region,
endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
region_name=config.region,
endpoint_url=config.endpoint_url.format(region=config.region),
aws_access_key_id=config.key,
aws_secret_access_key=config.secret
)
def _get_path(self, key):
return self.folder + self.clean_path(self.subfolder) + key
"""
Depends on the self.key_path configuration:
* random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
* default -> defaults to self.folder/key
"""
# defaults to /key
final_key = key
if self.key_path == "random":
if key not in self.key_dict:
ext = os.path.splitext(key)[1]
self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
final_key = self.key_dict[key]
return os.path.join(self.folder, final_key)
def get_cdn_url(self, key):
return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
def exists(self, key):
try:
@ -45,7 +66,6 @@ class S3Storage(Storage):
return False
def uploadf(self, file, key, **kwargs):
logger.debug(f'[S3 storage] uploading {file=}, {key=}')
if self.private:
extra_args = kwargs.get("extra_args", {})
else:
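How the new key_path option plays out (credentials and names are placeholders):

s3 = S3Storage(S3Config(bucket="my-bucket", region="reg1", key="KEY", secret="SECRET", key_path="random"))
s3.folder = "my-sheet/tab-1"
s3._get_path("video.mp4")    # e.g. 'my-sheet/tab-1/8a6e0804-2bd0-4672-b79d-d97027f9071a.mp4'
s3._get_path("video.mp4")    # the same UUID again, thanks to the key_dict cache
s3.get_cdn_url("video.mp4")  # 'https://my-bucket.reg1.cdn.digitaloceanspaces.com/my-sheet/tab-1/<uuid>.mp4'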

Wyświetl plik

@ -10,10 +10,10 @@ class GWorksheet:
"""
COLUMN_NAMES = {
'url': 'link',
'subfolder': 'sub folder',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'status': 'archive status',
'thumbnail': 'thumbnail',
'thumbnail_index': 'thumbnail index',
'timestamp': 'upload timestamp',
@ -25,9 +25,12 @@ class GWorksheet:
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
self.wks = worksheet
self.values = self.wks.get_values()
self.headers = [v.lower() for v in self.values[header_row - 1]]
self.columns = columns
self.values = self.wks.get_values()
if len(self.values) > 0:
self.headers = [v.lower() for v in self.values[header_row - 1]]
else:
self.headers = []
def _check_col_exists(self, col: str):
if col not in self.columns:
@ -69,12 +72,15 @@ class GWorksheet:
return ''
return row[col_index]
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
"""
return self.get_cell or default value on error (eg: column is missing)
"""
try:
return self.get_cell(row, col, fresh)
val = self.get_cell(row, col, fresh)
if when_empty_use_default and val.strip() == "":
return default
return val
except:
return default
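The new when_empty_use_default flag in practice, assuming gw is a GWorksheet instance (row and column names are illustrative):

# row 5 has an empty "folder" cell
gw.get_cell_or_default(5, 'folder', default='my-sheet/tab-1')                                 # 'my-sheet/tab-1'
gw.get_cell_or_default(5, 'folder', default='my-sheet/tab-1', when_empty_use_default=False)  # ''
gw.get_cell_or_default(5, 'nonexistent', default=None)                                       # None (missing column hits the except path)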

Wyświetl plik

@ -1,5 +1,28 @@
import os
import os, sys, requests
from loguru import logger
def mkdir_if_not_exists(folder):
if not os.path.exists(folder):
os.mkdir(folder)
os.makedirs(folder)
def expand_url(url):
# expand short URL links
if 'https://t.co/' in url:
try:
r = requests.get(url)
logger.debug(f'Expanded url {url} to {r.url}')
return r.url
except:
logger.error(f'Failed to expand url {url}')
return url
def getattr_or(o: object, prop: str, default = None):
try:
res = getattr(o, prop)
if res is None: raise ValueError  # treat a None value like a missing attribute
return res
except:
return default
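getattr_or in practice; it treats a None value like a missing attribute (the Args class stands in for an argparse.Namespace):

class Args:
    sheet = None                                  # e.g. --sheet was not passed on the CLI

getattr_or(Args(), "sheet", "sheet-from-config")  # 'sheet-from-config' (attribute exists but is None)
getattr_or(Args(), "missing", 42)                 # 42 (attribute does not exist)
getattr_or(Args(), "missing")                     # None (default default)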