Mirror of https://github.com/bellingcat/auto-archiver

Merge pull request #33 from bellingcat/refactor-configs

breaking changes: refactor configs + fixes (branch: pull/42/head)

commit 4c6f3ea688

.example.env (18 lines, file deleted)
@@ -1,18 +0,0 @@
-DO_SPACES_REGION=
-DO_SPACES_KEY=
-DO_SPACES_SECRET=
-DO_BUCKET=
-INTERNET_ARCHIVE_S3_KEY=
-INTERNET_ARCHIVE_S3_SECRET=
-TELEGRAM_API_ID=
-TELEGRAM_API_HASH=
-
-FACEBOOK_COOKIE=cookie: datr= xxxx
-
-# Google Drive, Right click on folder, Get link:
-# https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing
-# we want: 123456789987654321abcdefghijk
-# Remember to share the folder with the service email
-# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com
-GD_ROOT_FOLDER_ID=
-
.gitignore

@@ -10,4 +10,10 @@ anu.html
 *.log
 .pytest_cach
+anon*
-config*.json
+config.json
+config-*.json
+config.yaml
+config-*.yaml
 logs/*
+local_archive/
+vk_config*.json
Pipfile (7 lines)

@@ -6,10 +6,9 @@ name = "pypi"
 [packages]
 gspread = "*"
 boto3 = "*"
-python-dotenv = "*"
 argparse = "*"
 beautifulsoup4 = "*"
-tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
+tiktok-downloader = "*"
 bs4 = "*"
 loguru = "*"
 ffmpeg-python = "*"

@@ -21,6 +20,10 @@ google-api-python-client = "*"
 google-auth-httplib2 = "*"
 google-auth-oauthlib = "*"
 oauth2client = "*"
+python-slugify = "*"
+pyyaml = "*"
+vk-api = "*"
+dateparser = "*"

 [requires]
 python_version = "3.9"
Pipfile.lock

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "25b858227d74cc232bba525d34dcf30f15d18d535a6e9c59555e85a0a2bd8c61"
+            "sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd"
         },
         "pipfile-spec": 6,
         "requires": {

@@ -50,19 +50,19 @@
         },
         "boto3": {
             "hashes": [
-                "sha256:3fb956d097105a0fb98c29a622ff233fa8de68519aabd7088d7ffd36dfc33214",
-                "sha256:b59a210fa6a87f0c755b40403ffc66b9b285680bbc5ad5245cf167e2def33620"
+                "sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4",
+                "sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8"
             ],
             "index": "pypi",
-            "version": "==1.23.7"
+            "version": "==1.24.9"
         },
         "botocore": {
             "hashes": [
-                "sha256:0f4a467188644382856e96e85bff0b453442d5cf0c0f554154571a6e2468a005",
-                "sha256:9f8d5e8d65b24d97fcb7804b84831e5627fceb52707167d2f496477675c98ded"
+                "sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad",
+                "sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.26.7"
+            "markers": "python_version >= '3.7'",
+            "version": "==1.27.9"
         },
         "brotli": {
             "hashes": [

@@ -141,18 +141,18 @@
         },
         "cachetools": {
             "hashes": [
-                "sha256:4ebbd38701cdfd3603d1f751d851ed248ab4570929f2d8a7ce69e30c420b141c",
-                "sha256:8b3b8fa53f564762e5b221e9896798951e7f915513abf2ba072ce0f07f3f5a98"
+                "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757",
+                "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"
             ],
             "markers": "python_version ~= '3.7'",
-            "version": "==5.1.0"
+            "version": "==5.2.0"
         },
         "certifi": {
             "hashes": [
                 "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
                 "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
             ],
-            "markers": "python_version >= '3.6'",
+            "markers": "python_full_version >= '3.6.0'",
             "version": "==2022.5.18.1"
         },
         "cffi": {

@@ -215,7 +215,7 @@
                 "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
                 "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
             ],
-            "markers": "python_version >= '3'",
+            "markers": "python_version >= '3.5'",
             "version": "==2.0.12"
         },
         "click": {

@@ -233,6 +233,13 @@
             ],
             "version": "==1.2.60"
         },
+        "commonmark": {
+            "hashes": [
+                "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60",
+                "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"
+            ],
+            "version": "==0.9.1"
+        },
         "cryptography": {
             "hashes": [
                 "sha256:093cb351031656d3ee2f4fa1be579a8c69c754cf874206be1d4cf3b542042804",

@@ -260,13 +267,13 @@
             ],
             "version": "==37.0.2"
         },
-        "faker": {
+        "dateparser": {
             "hashes": [
-                "sha256:c6ff91847d7c820afc0a74d95e824b48aab71ddfd9003f300641e42d58ae886f",
-                "sha256:cad1f69d72a68878cd67855140b6fe3e44c11628971cd838595d289c98bc45de"
+                "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9",
+                "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==13.11.1"
+            "index": "pypi",
+            "version": "==1.1.1"
         },
         "ffmpeg-python": {
             "hashes": [

@@ -278,11 +285,11 @@
         },
         "filelock": {
             "hashes": [
-                "sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20",
-                "sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"
+                "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",
+                "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"
             ],
             "markers": "python_version >= '3.7'",
-            "version": "==3.7.0"
+            "version": "==3.7.1"
         },
         "flask": {
             "hashes": [

@@ -301,27 +308,27 @@
         },
         "google-api-core": {
             "hashes": [
-                "sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3",
-                "sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d"
+                "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0",
+                "sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.8.0"
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==2.8.1"
         },
         "google-api-python-client": {
             "hashes": [
-                "sha256:4527f7b8518a795624ab68da412d55628f83b98c67dd6a5d6edf725454f8b30b",
-                "sha256:600c43d7eac6e3536fdcad1d14ba9ee503edf4c7db0bd827e791bbf03b9f1330"
+                "sha256:a573373041b3f6ccbd04877b70e7425c52daec5b4fe5f440e8f5895c87d1a69c",
+                "sha256:b444f839bed289ecfe30950ea1cd15b7e7976d8cf9f0a3c778037ae3fb030df3"
             ],
             "index": "pypi",
-            "version": "==2.48.0"
+            "version": "==2.51.0"
         },
         "google-auth": {
             "hashes": [
-                "sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
-                "sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d"
+                "sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89",
+                "sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==2.6.6"
+            "version": "==2.8.0"
         },
         "google-auth-httplib2": {
             "hashes": [

@@ -333,34 +340,34 @@
         },
         "google-auth-oauthlib": {
             "hashes": [
-                "sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
-                "sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
+                "sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f",
+                "sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae"
             ],
             "index": "pypi",
-            "version": "==0.5.1"
+            "version": "==0.5.2"
         },
         "googleapis-common-protos": {
             "hashes": [
-                "sha256:6b5ee59dc646eb61a8eb65ee1db186d3df6687c8804830024f32573298bca19b",
-                "sha256:ddcd955b5bb6589368f659fa475373faa1ed7d09cde5ba25e88513d87007e174"
+                "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c",
+                "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.56.1"
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==1.56.2"
         },
         "gspread": {
             "hashes": [
-                "sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283",
-                "sha256:c719e1c024a2a6f3b7d818fbe07c3886b26fd6504b64d1b1359cf242968213cd"
+                "sha256:21704b47d007c3b5fd34eddfa4c4a9dcd1ecc1dc615083b9c636127726e66c18",
+                "sha256:b6172b62fa899e3e4199d2d0ea1008b64305554ba08d3d3a96e9123824fdec48"
             ],
             "index": "pypi",
-            "version": "==5.3.2"
+            "version": "==5.4.0"
         },
         "h11": {
             "hashes": [
                 "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06",
                 "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"
             ],
-            "markers": "python_version >= '3.6'",
+            "markers": "python_full_version >= '3.6.0'",
             "version": "==0.13.0"
         },
         "httplib2": {

@@ -376,7 +383,7 @@
                 "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
                 "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
             ],
-            "markers": "python_version >= '3'",
+            "markers": "python_version >= '3.5'",
             "version": "==3.3"
         },
         "importlib-metadata": {

@@ -421,70 +428,72 @@
         },
         "lxml": {
             "hashes": [
-                "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
-                "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
-                "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
-                "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
-                "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
-                "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
-                "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
-                "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
-                "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
-                "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
-                "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
-                "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
-                "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
-                "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
-                "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
-                "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
-                "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
-                "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
-                "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
-                "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
-                "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
-                "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
-                "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
-                "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
-                "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
-                "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
-                "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
-                "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
-                "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
-                "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
-                "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
-                "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
-                "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
-                "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
-                "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
-                "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
-                "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
-                "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
-                "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
-                "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
-                "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
-                "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
-                "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
-                "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
-                "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
-                "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
-                "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
-                "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
-                "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
-                "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
-                "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
-                "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
-                "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
-                "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
-                "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
-                "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
-                "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
-                "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
-                "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
-                "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
-                "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
+                "sha256:00f3a6f88fd5f4357844dd91a1abac5f466c6799f1b7f1da2df6665253845b11",
+                "sha256:024684e0c5cfa121c22140d3a0898a3a9b2ea0f0fd2c229b6658af4bdf1155e5",
+                "sha256:03370ec37fe562238d385e2c53089076dee53aabf8325cab964fdb04a9130fa0",
+                "sha256:0aa4cce579512c33373ca4c5e23c21e40c1aa1a33533a75e51b654834fd0e4f2",
+                "sha256:1057356b808d149bc14eb8f37bb89129f237df488661c1e0fc0376ca90e1d2c3",
+                "sha256:11d62c97ceff9bab94b6b29c010ea5fb6831743459bb759c917f49ba75601cd0",
+                "sha256:1254a79f8a67a3908de725caf59eae62d86738f6387b0a34b32e02abd6ae73db",
+                "sha256:1bfb791a8fcdbf55d1d41b8be940393687bec0e9b12733f0796668086d1a23ff",
+                "sha256:28cf04a1a38e961d4a764d2940af9b941b66263ed5584392ef875ee9c1e360a3",
+                "sha256:2b9c2341d96926b0d0e132e5c49ef85eb53fa92ae1c3a70f9072f3db0d32bc07",
+                "sha256:2d10659e6e5c53298e6d718fd126e793285bff904bb71d7239a17218f6a197b7",
+                "sha256:3af00ee88376022589ceeb8170eb67dacf5f7cd625ea59fa0977d719777d4ae8",
+                "sha256:3cf816aed8125cfc9e6e5c6c31ff94278320d591bd7970c4a0233bee0d1c8790",
+                "sha256:4becd16750ca5c2a1b1588269322b2cebd10c07738f336c922b658dbab96a61c",
+                "sha256:4cd69bca464e892ea4ed544ba6a7850aaff6f8d792f8055a10638db60acbac18",
+                "sha256:4e97c8fc761ad63909198acc892f34c20f37f3baa2c50a62d5ec5d7f1efc68a1",
+                "sha256:520461c36727268a989790aef08884347cd41f2d8ae855489ccf40b50321d8d7",
+                "sha256:53b0410b220766321759f7f9066da67b1d0d4a7f6636a477984cbb1d98483955",
+                "sha256:56e19fb6e4b8bd07fb20028d03d3bc67bcc0621347fbde64f248e44839771756",
+                "sha256:5a49ad78543925e1a4196e20c9c54492afa4f1502c2a563f73097e2044c75190",
+                "sha256:5d52e1173f52020392f593f87a6af2d4055dd800574a5cb0af4ea3878801d307",
+                "sha256:607224ffae9a0cf0a2f6e14f5f6bce43e83a6fbdaa647891729c103bdd6a5593",
+                "sha256:612ef8f2795a89ba3a1d4c8c1af84d8453fd53ee611aa5ad460fdd2cab426fc2",
+                "sha256:615886ee84b6f42f1bdf1852a9669b5fe3b96b6ff27f1a7a330b67ad9911200a",
+                "sha256:63419db39df8dc5564f6f103102c4665f7e4d9cb64030e98cf7a74eae5d5760d",
+                "sha256:6467626fa74f96f4d80fc6ec2555799e97fff8f36e0bfc7f67769f83e59cff40",
+                "sha256:65b3b5f12c6fb5611e79157214f3cd533083f9b058bf2fc8a1c5cc5ee40fdc5a",
+                "sha256:686565ac77ff94a8965c11829af253d9e2ce3bf0d9225b1d2eb5c4d4666d0dca",
+                "sha256:6af7f51a6010748fc1bb71917318d953c9673e4ae3f6d285aaf93ef5b2eb11c1",
+                "sha256:70a198030d26f5e569367f0f04509b63256faa76a22886280eea69a4f535dd40",
+                "sha256:754a1dd04bff8a509a31146bd8f3a5dc8191a8694d582dd5fb71ff09f0722c22",
+                "sha256:75da29a0752c8f2395df0115ac1681cefbdd4418676015be8178b733704cbff2",
+                "sha256:81c29c8741fa07ecec8ec7417c3d8d1e2f18cf5a10a280f4e1c3f8c3590228b2",
+                "sha256:9093a359a86650a3dbd6532c3e4d21a6f58ba2cb60d0e72db0848115d24c10ba",
+                "sha256:915ecf7d486df17cc65aeefdb680d5ad4390cc8c857cf8db3fe241ed234f856a",
+                "sha256:94b181dd2777890139e49a5336bf3a9a3378ce66132c665fe8db4e8b7683cde2",
+                "sha256:94f2e45b054dd759bed137b6e14ae8625495f7d90ddd23cf62c7a68f72b62656",
+                "sha256:9af19eb789d674b59a9bee5005779757aab857c40bf9cc313cb01eafac55ce55",
+                "sha256:9cae837b988f44925d14d048fa6a8c54f197c8b1223fd9ee9c27084f84606143",
+                "sha256:aa7447bf7c1a15ef24e2b86a277b585dd3f055e8890ac7f97374d170187daa97",
+                "sha256:b1e22f3ee4d75ca261b6bffbf64f6f178cb194b1be3191065a09f8d98828daa9",
+                "sha256:b5031d151d6147eac53366d6ec87da84cd4d8c5e80b1d9948a667a7164116e39",
+                "sha256:b62d1431b4c40cda43cc986f19b8c86b1d2ae8918cfc00f4776fdf070b65c0c4",
+                "sha256:b71c52d69b91af7d18c13aef1b0cc3baee36b78607c711eb14a52bf3aa7c815e",
+                "sha256:b7679344f2270840dc5babc9ccbedbc04f7473c1f66d4676bb01680c0db85bcc",
+                "sha256:bb7c1b029e54e26e01b1d1d912fc21abb65650d16ea9a191d026def4ed0859ed",
+                "sha256:c2a57755e366e0ac7ebdb3e9207f159c3bf1afed02392ab18453ce81f5ee92ee",
+                "sha256:cf9ec915857d260511399ab87e1e70fa13d6b2972258f8e620a3959468edfc32",
+                "sha256:d0d03b9636f1326772e6854459728676354d4c7731dae9902b180e2065ba3da6",
+                "sha256:d1690c4d37674a5f0cdafbc5ed7e360800afcf06928c2a024c779c046891bf09",
+                "sha256:d76da27f5e3e9bc40eba6ed7a9e985f57547e98cf20521d91215707f2fb57e0f",
+                "sha256:d882c2f3345261e898b9f604be76b61c901fbfa4ac32e3f51d5dc1edc89da3cb",
+                "sha256:d8e5021e770b0a3084c30dda5901d5fce6d4474feaf0ced8f8e5a82702502fbb",
+                "sha256:dd00d28d1ab5fa7627f5abc957f29a6338a7395b724571a8cbff8fbed83aaa82",
+                "sha256:e35a298691b9e10e5a5631f8f0ba605b30ebe19208dc8f58b670462f53753641",
+                "sha256:e4d020ecf3740b7312bacab2cb966bb720fd4d3490562d373b4ad91dd1857c0d",
+                "sha256:e564d5a771b4015f34166a05ea2165b7e283635c41b1347696117f780084b46d",
+                "sha256:ea3f2e9eb41f973f73619e88bf7bd950b16b4c2ce73d15f24a11800ce1eaf276",
+                "sha256:eabdbe04ee0a7e760fa6cd9e799d2b020d098c580ba99107d52e1e5e538b1ecb",
+                "sha256:f17b9df97c5ecdfb56c5e85b3c9df9831246df698f8581c6e111ac664c7c656e",
+                "sha256:f386def57742aacc3d864169dfce644a8c396f95aa35b41b69df53f558d56dd0",
+                "sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b",
+                "sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==4.8.0"
+            "version": "==4.9.0"
         },
         "markupsafe": {
             "hashes": [

@@ -553,16 +562,16 @@
                 "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
                 "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
             ],
-            "markers": "python_version >= '3.6'",
+            "markers": "python_full_version >= '3.6.0'",
             "version": "==3.2.0"
         },
         "outcome": {
             "hashes": [
-                "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958",
-                "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"
+                "sha256:6f82bd3de45da303cf1f771ecafa1633750a358436a8bb60e06a1ceb745d2672",
+                "sha256:c4ab89a56575d6d38a05aa16daeaa333109c1f96167aba8901ab18b6b5e0f7f5"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.1.0"
+            "markers": "python_version >= '3.7'",
+            "version": "==1.2.0"
         },
         "protobuf": {
             "hashes": [

@@ -676,6 +685,14 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
             "version": "==3.14.1"
         },
+        "pygments": {
+            "hashes": [
+                "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
+                "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
+            ],
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==2.12.0"
+        },
         "pyopenssl": {
             "hashes": [
                 "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf",

@@ -688,7 +705,7 @@
                 "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
                 "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
             ],
-            "markers": "python_full_version >= '3.6.8'",
+            "markers": "python_version >= '3.1'",
             "version": "==3.0.9"
         },
         "pysocks": {
             "hashes": [

@@ -707,21 +724,156 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.8.2"
         },
-        "python-dotenv": {
+        "python-slugify": {
             "hashes": [
-                "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
-                "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
+                "sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1",
+                "sha256:7b2c274c308b62f4269a9ba701aa69a797e9bca41aeee5b3a9e79e36b6656927"
             ],
             "index": "pypi",
-            "version": "==0.20.0"
+            "version": "==6.1.2"
         },
-        "requests": {
+        "pytz": {
             "hashes": [
-                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
-                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
+                "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
+                "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
             ],
+            "version": "==2022.1"
+        },
+        "pytz-deprecation-shim": {
+            "hashes": [
+                "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+                "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
+            ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==2.27.1"
+            "version": "==0.1.0.post0"
         },
+        "pyyaml": {
+            "hashes": [
+                "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
+                "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b",
+                "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57",
+                "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b",
+                "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4",
+                "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07",
+                "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba",
+                "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9",
+                "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287",
+                "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513",
+                "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0",
+                "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0",
+                "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92",
+                "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f",
+                "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2",
+                "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc",
+                "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c",
+                "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86",
+                "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4",
+                "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c",
+                "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34",
+                "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b",
+                "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c",
+                "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb",
+                "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737",
+                "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3",
+                "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d",
+                "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53",
+                "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78",
+                "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803",
+                "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a",
+                "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174",
+                "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"
+            ],
+            "index": "pypi",
+            "version": "==6.0"
+        },
+        "regex": {
+            "hashes": [
+                "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14",
+                "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9",
+                "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204",
+                "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f",
+                "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737",
+                "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b",
+                "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3",
+                "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4",
+                "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac",
+                "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f",
+                "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29",
+                "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772",
+                "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1",
+                "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863",
+                "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66",
+                "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed",
+                "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47",
+                "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f",
+                "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f",
+                "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008",
+                "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d",
+                "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571",
+                "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0",
+                "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a",
+                "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3",
+                "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7",
+                "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447",
+                "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493",
+                "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4",
+                "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede",
+                "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640",
+                "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd",
+                "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c",
+                "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee",
+                "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30",
+                "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b",
+                "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec",
+                "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1",
+                "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e",
+                "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8",
+                "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9",
+                "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231",
+                "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7",
+                "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729",
+                "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960",
+                "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056",
+                "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357",
+                "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7",
+                "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3",
+                "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7",
+                "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573",
+                "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0",
+                "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178",
+                "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f",
+                "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834",
+                "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c",
+                "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015",
+                "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0",
+                "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57",
+                "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635",
+                "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07",
+                "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2",
+                "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1",
+                "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b",
+                "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2",
+                "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5",
+                "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b",
+                "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86",
+                "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5",
+                "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93",
+                "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0",
+                "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f",
+                "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
+                "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
+            ],
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==2022.3.2"
+        },
+        "requests": {
+            "extras": [],
+            "hashes": [
+                "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
+                "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
+            ],
+            "markers": "python_version >= '3.7' and python_version < '4'",
+            "version": "==2.28.0"
+        },
         "requests-oauthlib": {
             "hashes": [

@@ -738,28 +890,36 @@
             ],
             "version": "==0.9.1"
         },
+        "rich": {
+            "hashes": [
+                "sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2",
+                "sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec"
+            ],
+            "markers": "python_version < '4' and python_full_version >= '3.6.3'",
+            "version": "==12.4.4"
+        },
         "rsa": {
             "hashes": [
                 "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
                 "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
             ],
-            "markers": "python_version >= '3.6' and python_version < '4'",
+            "markers": "python_version < '4' and python_full_version >= '3.6.0'",
             "version": "==4.8"
         },
         "s3transfer": {
             "hashes": [
-                "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
-                "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
+                "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+                "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
             ],
-            "markers": "python_version >= '3.6'",
-            "version": "==0.5.2"
+            "markers": "python_version >= '3.7'",
+            "version": "==0.6.0"
         },
         "selenium": {
            "hashes": [
-                "sha256:866b6dd6c459210662bff922ee7c33162d21920fbf6811e8e5a52be3866a687f"
+                "sha256:ba5b2633f43cf6fe9d308fa4a6996e00a101ab9cb1aad6fd91ae1f3dbe57f56f"
            ],
            "index": "pypi",
-            "version": "==4.1.5"
+            "version": "==4.2.0"
         },
         "six": {
             "hashes": [

@@ -797,7 +957,7 @@
                 "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
                 "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
             ],
-            "markers": "python_version >= '3.6'",
+            "markers": "python_full_version >= '3.6.0'",
             "version": "==2.3.2.post1"
         },
         "telethon": {
             "hashes": [

@@ -808,17 +968,35 @@
             "index": "pypi",
             "version": "==1.24.0"
         },
+        "text-unidecode": {
+            "hashes": [
+                "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+                "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+            ],
+            "version": "==1.3"
+        },
         "tiktok-downloader": {
-            "git": "https://github.com/msramalho/tiktok-downloader",
-            "ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c"
+            "hashes": [
+                "sha256:48fe204df962893a60360a20b13da133bc22bdbfec87c3cd3a9157f138785242"
+            ],
+            "index": "pypi",
+            "version": "==0.3.3"
         },
+        "tqdm": {
+            "hashes": [
+                "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d",
+                "sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==4.64.0"
+        },
         "trio": {
             "hashes": [
-                "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070",
-                "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"
+                "sha256:4dc0bf9d5cc78767fc4516325b6d80cc0968705a31d0eec2ecd7cdda466265b0",
+                "sha256:523f39b7b69eef73501cebfe1aafd400a9aad5b03543a0eded52952488ff1c13"
             ],
             "markers": "python_version >= '3.7'",
-            "version": "==0.20.0"
+            "version": "==0.21.0"
         },
         "trio-websocket": {
             "hashes": [

@@ -828,15 +1006,35 @@
             "markers": "python_version >= '3.5'",
             "version": "==0.9.2"
         },
+        "tzdata": {
+            "hashes": [
+                "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9",
+                "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3"
+            ],
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==2022.1"
+        },
+        "tzlocal": {
+            "hashes": [
+                "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+                "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
+            ],
+            "markers": "python_full_version >= '3.6.0'",
+            "version": "==4.2"
+        },
         "uritemplate": {
             "hashes": [
                 "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
                 "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
             ],
-            "markers": "python_version >= '3.6'",
+            "markers": "python_full_version >= '3.6.0'",
             "version": "==4.1.1"
         },
         "urllib3": {
             "extras": [
                 "secure",
                 "socks"
             ],
             "hashes": [
                 "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
                 "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"

@@ -844,6 +1042,14 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
             "version": "==1.26.9"
         },
+        "vk-api": {
+            "hashes": [
+                "sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
+                "sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
+            ],
+            "index": "pypi",
+            "version": "==11.9.8"
+        },
         "websockets": {
             "hashes": [
                 "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",
README.md (151 lines)

@@ -1,50 +1,116 @@
 # auto-archiver
 
-This Python script will look for links to Youtube, Twitter, etc,. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis.
+Python script to automatically archive social media posts, videos, and images from a Google Sheets document. Uses different archivers depending on the platform, and can save content to local storage, S3 bucket (Digital Ocean Spaces, AWS, ...), and Google Drive. The Google Sheets where the links come from is updated with information about the archived content. It can be run manually or on an automated basis.
 
 ## Setup
 
 If you are using `pipenv` (recommended), `pipenv install` is sufficient to install Python prerequisites.
 
-[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
+You also need:
+1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
+2. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
+3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
+4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
+5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
 
-[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
+### Configuration file
+Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
 
-[firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
+<details><summary><code>python auto_archive.py --help</code></summary>
 
-[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
-
-A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
-
-```
-DO_SPACES_REGION=
-DO_BUCKET=
-DO_SPACES_KEY=
-DO_SPACES_SECRET=
-INTERNET_ARCHIVE_S3_KEY=
-INTERNET_ARCHIVE_S3_SECRET=
-TELEGRAM_API_ID=
-TELEGRAM_API_HASH=
-FACEBOOK_COOKIE=
-GD_ROOT_FOLDER_ID=
+```js
+usage: auto_archive.py [-h] [--config CONFIG] [--storage {s3,local,gd}] [--sheet SHEET] [--header HEADER] [--check-if-exists] [--save-logs] [--s3-private] [--col-url URL] [--col-status STATUS] [--col-folder FOLDER]
+                       [--col-archive ARCHIVE] [--col-date DATE] [--col-thumbnail THUMBNAIL] [--col-thumbnail_index THUMBNAIL_INDEX] [--col-timestamp TIMESTAMP] [--col-title TITLE] [--col-duration DURATION]
+                       [--col-screenshot SCREENSHOT] [--col-hash HASH]
+
+Automatically archive social media posts, videos, and images from a Google Sheets document.
+The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options
+are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       the filename of the YAML configuration file (defaults to 'config.yaml')
+  --storage {s3,local,gd}
+                        which storage to use [execution.storage in config.yaml]
+  --sheet SHEET         the name of the google sheets document [execution.sheet in config.yaml]
+  --header HEADER       1-based index for the header row [execution.header in config.yaml]
+  --check-if-exists     when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists]
+  --save-logs           creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs]
+  --s3-private          Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]
+  --col-url URL         the name of the column to READ url FROM (default='link')
+  --col-status STATUS   the name of the column to FILL WITH status (default='archive status')
+  --col-folder FOLDER   the name of the column to READ folder FROM (default='destination folder')
+  --col-archive ARCHIVE
+                        the name of the column to FILL WITH archive (default='archive location')
+  --col-date DATE       the name of the column to FILL WITH date (default='archive date')
+  --col-thumbnail THUMBNAIL
+                        the name of the column to FILL WITH thumbnail (default='thumbnail')
+  --col-thumbnail_index THUMBNAIL_INDEX
+                        the name of the column to FILL WITH thumbnail_index (default='thumbnail index')
+  --col-timestamp TIMESTAMP
+                        the name of the column to FILL WITH timestamp (default='upload timestamp')
+  --col-title TITLE     the name of the column to FILL WITH title (default='upload title')
+  --col-duration DURATION
+                        the name of the column to FILL WITH duration (default='duration')
+  --col-screenshot SCREENSHOT
+                        the name of the column to FILL WITH screenshot (default='screenshot')
+  --col-hash HASH       the name of the column to FILL WITH hash (default='hash')
 ```
-
-`.example.env` is an example of this file
+</details><br/>
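To make the override behaviour concrete, here is a minimal, illustrative sketch of combining a YAML file with command-line flags. It is not the project's actual implementation; the `execution.sheet` key is taken from the help text above, everything else is an assumption:

```python
# Illustrative sketch only -- not code from this commit.
import argparse
import yaml  # provided by the pyyaml dependency added in this PR

parser = argparse.ArgumentParser()
parser.add_argument("--config", default="config.yaml")
parser.add_argument("--sheet")  # optional override of execution.sheet
args = parser.parse_args()

with open(args.config) as f:
    config = yaml.safe_load(f)

# the command line always wins over the YAML file, as the help text states
if args.sheet is not None:
    config.setdefault("execution", {})["sheet"] = args.sheet
```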
+#### Example invocations
+All the configurations can be specified in the YAML config file, but sometimes it is useful to override only some of those, like the sheet that we are running the archival on. Here are some examples (possibly prepended by `pipenv run`):
+
+```bash
+# all the configurations come from config.yaml
+python auto_archive.py
+
+# all the configurations come from config.yaml,
+# checks if URL is not archived twice and saves logs to logs/ folder
+python auto_archive.py --check-if-exists --save_logs
+
+# all the configurations come from my_config.yaml
+python auto_archive.py --config my_config.yaml
+
+# reads the configurations but saves archived content to google drive instead
+python auto_archive.py --config my_config.yaml --storage gd
+
+# uses the configurations but for another google docs sheet
+# with a header on row 2 and with some different column names
+python auto_archive.py --config my_config.yaml --sheet="use it on another sheets doc" --header=2 --col-link="put urls here"
+
+# all the configurations come from config.yaml and specifies that s3 files should be private
+python auto_archive.py --s3-private
+```
+
+### Extra notes on configuration
+#### Google Drive
+To use Google Drive storage you need the id of the shared folder in the `config.yaml` file, which must be shared with the service account, eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`, and then you can use `--storage=gd`.
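As the deleted `.example.env` comments explained, the folder id is the last path segment of the folder's "Get link" URL. A small hypothetical helper (not part of the repository) to extract it:

```python
from urllib.parse import urlparse

def gd_folder_id(share_url: str) -> str:
    # e.g. https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing
    # -> 123456789987654321abcdefghijk (last path segment; the query string is ignored)
    return urlparse(share_url).path.rstrip("/").split("/")[-1]

print(gd_folder_id("https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing"))
```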
+#### Telethon (Telegram's API Library)
+The first time you run, you will be prompted to do an authentication with the phone number associated; alternatively you can put your `anon.session` in the root.
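For context, this is what that first-run authentication looks like with standard Telethon usage (a sketch, not the project's code; the placeholder credentials must be replaced with your own):

```python
from telethon.sync import TelegramClient

api_id = 12345                                 # placeholder for TELEGRAM_API_ID
api_hash = "0123456789abcdef0123456789abcdef"  # placeholder for TELEGRAM_API_HASH

# Creating a client with session name "anon" writes anon.session to disk; on the
# first run Telethon prompts for the phone number and the login code it sends.
with TelegramClient("anon", api_id, api_hash) as client:
    print(client.get_me().username)
```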
-Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
-
 ## Running
 
-There is just one necessary command line flag, `--sheet name` which the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. This sheet must also have specific columns in the first row:
-* `Media URL` (required): the location of the media to be archived. This is the only column that should be supplied with data initially
+The `--sheet name` property (or `execution.sheet` in the YAML file) is the name of the Google Sheet to check for URLs.
+This sheet must have been shared with the Google Service account used by `gspread`.
+This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)), only the `link` and `status` columns are mandatory:
+* `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially
 * `Archive status` (required): the status of the auto archiver script. Any row with text in this column will be skipped automatically.
+* `Archive location` (required): the location of the archived version. For files that were not able to be auto archived, this can be manually updated.
+* `Destination folder`: (optional) by default files are saved to a folder called `name-of-sheets-document/name-of-sheets-tab/`; using this option you can organize documents into folders from the sheet.
-* `Archive location`: the location of the archived version. For files that were not able to be auto archived, this can be manually updated.
 * `Archive date`: the date that the auto archiver script ran for this file
 * `Upload timestamp`: the timestamp extracted from the video. (For YouTube, this unfortunately does not currently include the time)
-* `Duration`: the duration of the video
 * `Upload title`: the "title" of the video from the original source
-* `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible)
-* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content
+* `Hash`: a hash of the first video or image found
+* `Screenshot`: a screenshot taken from a browser view of opening the page
+* in case of videos
+  * `Duration`: duration in seconds
+  * `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible)
+  * `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content
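A minimal sketch of how such a sheet can be read with `gspread` (illustrative only; the real logic lives in the project's worksheet handling, and the sheet name here is a stand-in):

```python
import gspread

gc = gspread.service_account(filename="service_account.json")
worksheet = gc.open("My Archive Sheet").sheet1  # must be shared with the service account

# assumes the header is on row 1; each record maps column names to cell values
for record in worksheet.get_all_records():
    if record.get("Link") and not record.get("Archive status"):
        print("would archive:", record["Link"])
```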
 For example, for use with this spreadsheet:

@@ -74,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t
 
 # auto_auto_archiver
 
-To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. It must be shared with the same service account.
+To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archiver.py` but use `--sheet` to specify the name of the sheet that lists the names of sheets to archive. It must be shared with the same service account.
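In outline, that behaviour amounts to something like the following (an assumed shape, not the actual script; the master sheet name is hypothetical):

```python
import subprocess
import gspread

gc = gspread.service_account(filename="service_account.json")
master = gc.open("Auto-archiver master").sheet1  # hypothetical master document

# col_values(1) is column A; [10:] skips the 10 instructional rows, starting at row 11
for sheet_name in master.col_values(1)[10:]:
    if sheet_name.strip():
        subprocess.run(["python", "auto_archive.py", "--sheet", sheet_name])
```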

|
||||
|
||||
|
@ -86,37 +152,24 @@ Code is split into functional concepts:
|
|||
1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
|
||||
|
||||
### Current Archivers
|
||||
Archivers are tested in a meaningful order with Wayback Machine being the failsafe, that can easily be changed in the code.
|
||||
```mermaid
|
||||
graph TD
|
||||
A(Archiver) -->|parent of| B(TelegramArchiver)
|
||||
A -->|parent of| C(TikTokArchiver)
|
||||
A(Archiver) -->|parent of| B(TelethonArchiver)
|
||||
A -->|parent of| C(TiktokArchiver)
|
||||
A -->|parent of| D(YoutubeDLArchiver)
|
||||
A -->|parent of| E(WaybackArchiver)
|
||||
A -->|parent of| E(TelegramArchiver)
|
||||
A -->|parent of| F(TwitterArchiver)
|
||||
A -->|parent of| G(VkArchiver)
|
||||
A -->|parent of| H(WaybackArchiver)
|
||||
```
|
||||
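In code, the "ordered archivers with a failsafe" idea amounts to a loop like the sketch below, assembled from the class names above (the exact orchestration in auto_archive.py may differ):

```python
from archivers import (TelegramArchiver, TelethonArchiver, TiktokArchiver,
                       TwitterArchiver, VkArchiver, WaybackArchiver, YoutubeDLArchiver)

def archive_url(url, storage, driver, check_if_exists=True):
    # order matters: the more specific archivers go first,
    # Wayback Machine last because it can archive almost any page
    active_archivers = [
        TelethonArchiver(storage, driver),
        TiktokArchiver(storage, driver),
        YoutubeDLArchiver(storage, driver),
        TelegramArchiver(storage, driver),
        TwitterArchiver(storage, driver),
        VkArchiver(storage, driver),
        WaybackArchiver(storage, driver),  # failsafe
    ]
    for archiver in active_archivers:
        result = archiver.download(url, check_if_exists=check_if_exists)
        if result:
            return result  # stop at the first archiver that succeeds
```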
 ### Current Storages
 ```mermaid
 graph TD
     A(BaseStorage) -->|parent of| B(S3Storage)
-    A(BaseStorage) -->|parent of| C(GoogleDriveStorage)
+    A(BaseStorage) -->|parent of| C(LocalStorage)
+    A(BaseStorage) -->|parent of| D(GoogleDriveStorage)
 ```
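A plausible way to wire the `--storage {s3,local,gd}` choice to these classes, assuming the three subclasses are importable from the `storages` package (the diagram suggests this, but the diff does not show it):

```python
def get_storage(kind: str, **options):
    # maps the --storage flag to a BaseStorage subclass; names per the diagram above
    from storages import S3Storage, LocalStorage, GoogleDriveStorage
    storages = {"s3": S3Storage, "local": LocalStorage, "gd": GoogleDriveStorage}
    return storages[kind](**options)
```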
 ## Saving into Subfolders
 
 You can have a column in the spreadsheet for the argument `--col-subfolder` that is passed to the storage and can specify a subfolder to put the archived link into.
 
-## Google Drive
-
-To use Google Drive storage you need the id of the shared folder in the `.env` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com`
-
-```bash
-python auto_archive.py --sheet 'Sheet Name' --storage='gd'
-```
-
-## Telethon (Telegrams API Library)
-
-Put your `anon.session` in the root, so that it doesn't stall and ask for authentication
archivers/__init__.py

@@ -1,8 +1,9 @@
 # we need to explicitly expose the available imports here
-from .base_archiver import *
-from .telegram_archiver import *
-from .telethon_archiver import *
-from .tiktok_archiver import *
-from .wayback_archiver import *
-from .youtubedl_archiver import *
-from .twitter_archiver import *
+from .base_archiver import Archiver, ArchiveResult
+from .telegram_archiver import TelegramArchiver
+from .telethon_archiver import TelethonArchiver
+from .tiktok_archiver import TiktokArchiver
+from .wayback_archiver import WaybackArchiver
+from .youtubedl_archiver import YoutubeDLArchiver
+from .twitter_archiver import TwitterArchiver
+from .vk_archiver import VkArchiver
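With these explicit exports, callers import the concrete classes by name (and tools can see exactly what the package exposes) instead of pulling in everything via `*`, for example:

```python
from archivers import Archiver, ArchiveResult, TelethonArchiver, VkArchiver
```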
|
|
@@ -1,23 +1,18 @@
import os
import ffmpeg
import datetime
import shutil
import os, datetime, shutil, hashlib, time, requests, re, mimetypes
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
import hashlib
import time
import requests
from random import randrange

import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify

from storages import Storage
from utils import mkdir_if_not_exists

from selenium.webdriver.common.by import By
from loguru import logger
from selenium.common.exceptions import TimeoutException


@dataclass
class ArchiveResult:

@@ -34,6 +29,7 @@ class ArchiveResult:

class Archiver(ABC):
    name = "default"
    retry_regex = r"retrying at (\d+)$"

    def __init__(self, storage: Storage, driver):
        self.storage = storage

@@ -42,30 +38,39 @@ class Archiver(ABC):
    def __str__(self):
        return self.__class__.__name__

    def __repr__(self):
        return self.__str__()

    @abstractmethod
    def download(self, url, check_if_exists=False): pass

    def get_netloc(self, url):
        return urlparse(url).netloc

    def get_html_key(self, url):
        return self.get_key(urlparse(url).path.replace("/", "_") + ".html")

    # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
    def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
        """
        Generates an index.html page where each @urls_info is displayed
        """
        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''

        for url_info in urls_info:
            page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
            mime_global = self._guess_file_type(url_info["key"])
            preview = ""
            if mime_global == "image":
                preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
            elif mime_global == "video":
                preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
            page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''

        page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
        page += f"</body></html>"

        page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
        page_filename = 'tmp/' + page_key
        page_key = self.get_html_key(url)
        page_filename = os.path.join(Storage.TMP_FOLDER, page_key)

        with open(page_filename, "w") as f:
            f.write(page)

@@ -78,8 +83,23 @@ class Archiver(ABC):
        page_cdn = self.storage.get_cdn_url(page_key)
        return (page_cdn, page_hash, thumbnail)

    def _guess_file_type(self, path: str):
        """
        Receives a URL or filename and returns global mimetype like 'image' or 'video'
        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            return mime.split("/")[0]
        return ""

    # eg images in a tweet save to cloud storage

    def generate_media_page(self, urls, url, object):
        """
        For a list of media urls, fetch them, upload them
        and call self.generate_media_page_html with them
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

@@ -87,26 +107,16 @@ class Archiver(ABC):
        thumbnail = None
        uploaded_media = []
        for media_url in urls:
            path = urlparse(media_url).path
            key = self.get_key(path.replace("/", "_"))
            if '.' not in path:
                key += '.jpg'
            key = self._get_key_from_url(media_url, ".jpg")

            filename = 'tmp/' + key
            filename = os.path.join(Storage.TMP_FOLDER, key)

            # eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
            d = requests.get(media_url, headers=headers)
            with open(filename, 'wb') as f:
                f.write(d.content)

            # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
            # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
            # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
            self.storage.upload(filename, key)

            hash = self.get_hash(filename)

            # eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
            cdn_url = self.storage.get_cdn_url(key)

            if thumbnail is None:

@@ -130,21 +140,36 @@ class Archiver(ABC):

        return f'{self.name}_{_id}{extension}'

    def get_hash(self, filename):
        f = open(filename, "rb")
        bytes = f.read()  # read entire file as bytes
    def get_html_key(self, url):
        return self._get_key_from_url(url, ".html")

        # TODO: customizable hash
        hash = hashlib.sha256(bytes)
        # option to use SHA3_512 instead
        # hash = hashlib.sha3_512(bytes)
        f.close()
    def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
        """
        Receives a URL and returns a slugified version of the URL path
        if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
        if @append_date is true, the key adds a timestamp after the URL slug and before the extension
        """
        slug = slugify(urlparse(url).path)
        if append_datetime:
            slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
        if with_extension is not None:
            if "." not in slug:
                slug += with_extension
        return self.get_key(slug)

    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
        # TODO: customizable hash
        hash = hashlib.sha256(bytes)
        # option to use SHA3_512 instead
        # hash = hashlib.sha3_512(bytes)
        return hash.hexdigest()

    def get_screenshot(self, url):
        key = self.get_key(urlparse(url).path.replace(
            "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
        filename = 'tmp/' + key
        logger.debug(f"getting screenshot for {url=}")
        key = self._get_key_from_url(url, ".png", append_datetime=True)
        filename = os.path.join(Storage.TMP_FOLDER, key)

        # Accept cookies popup dismiss for ytdlp video
        if 'facebook.com' in url:

@@ -154,7 +179,7 @@ class Archiver(ABC):
                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                foo.click()
                logger.debug(f'fb click worked')
                # linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except:
                logger.warning(f'Failed on fb accept cookies for url {url}')

@@ -200,12 +225,11 @@ class Archiver(ABC):
            key = key_folder + fname

            self.storage.upload(thumbnail_filename, key)

            cdn_url = self.storage.get_cdn_url(key)
            cdn_urls.append(cdn_url)

        if len(cdn_urls) == 0:
            return ('None', 'None')
            return ('', '')

        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

@@ -230,3 +254,37 @@ class Archiver(ABC):
        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

        return (key_thumb, thumb_index_cdn_url)

    def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
        """
        sets state to retry in random between (min_seconds, max_seconds)
        """
        now = datetime.datetime.now().timestamp()
        retry_at = int(now + randrange(min_seconds, max_seconds))
        logger.debug(f"signaling {retry_at=}")
        return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)

    def is_retry(status):
        return re.search(Archiver.retry_regex, status) is not None

    def should_retry_from_status(status):
        """
        checks status against message in signal_retry_in
        returns true if enough time has elapsed, false otherwise
        """
        match = re.search(Archiver.retry_regex, status)
        if match:
            retry_at = int(match.group(1))
            now = datetime.datetime.now().timestamp()
            should_retry = now >= retry_at
            logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
            return should_retry
        return False

    def remove_retry(status):
        """
        transforms the status from retry into something else
        """
        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
        logger.debug(f"removing retry message at {status=}, got {new_status=}")
        return new_status
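A minimal sketch (not part of this diff) of how the new retry helpers round-trip state through the status string itself, using only the regex and logic from the Archiver methods above:

import re, datetime
from random import randrange

retry_regex = r"retrying at (\d+)$"

# signal_retry_in embeds a future unix timestamp in the status string
now = datetime.datetime.now().timestamp()
status = f"retrying at {int(now + randrange(1800, 7200))}"

# should_retry_from_status retries only once that timestamp has passed
match = re.search(retry_regex, status)
print(match is not None and datetime.datetime.now().timestamp() >= int(match.group(1)))  # False until 30-120 minutes later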
@@ -1,11 +1,11 @@
import os
import requests
import os, requests, re

import html
from bs4 import BeautifulSoup
from loguru import logger
import re
import html

from .base_archiver import Archiver, ArchiveResult
from storages import Storage


class TelegramArchiver(Archiver):

@@ -52,8 +52,7 @@ class TelegramArchiver(Archiver):
        video_id = video_url.split('/')[-1].split('?')[0]
        key = self.get_key(video_id)

        filename = 'tmp/' + key
        cdn_url = self.storage.get_cdn_url(key)
        filename = os.path.join(Storage.TMP_FOLDER, key)

        if check_if_exists and self.storage.exists(key):
            status = 'already archived'

@@ -84,5 +83,6 @@ class TelegramArchiver(Archiver):
            filename, key, duration=duration)
        os.remove(filename)

        cdn_url = self.storage.get_cdn_url(key)
        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)
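A recurring change across these archivers swaps the hardcoded 'tmp/' prefix for the configurable Storage.TMP_FOLDER. A sketch (names mirror the diff; the key is made up), assuming TMP_FOLDER keeps its default trailing slash:

import os

TMP_FOLDER = "tmp/"  # mirrors the Storage.TMP_FOLDER default, configurable via execution.tmp_folder
key = "video_id.mp4"  # hypothetical storage key
print(os.path.join(TMP_FOLDER, key))  # tmp/video_id.mp4 -- same result as 'tmp/' + key, but follows the config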
@@ -1,28 +1,25 @@
import os
import re
import html
from dataclasses import dataclass
from loguru import logger
import os, re

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
import html
from loguru import logger
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError


@dataclass
class TelegramConfig:
    api_id: str
    api_hash: str
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import TelethonConfig
from utils import getattr_or


class TelethonArchiver(Archiver):
    name = "telethon"
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)")

    def __init__(self, storage: Storage, driver, config: TelegramConfig):
    def __init__(self, storage: Storage, driver, config: TelethonConfig):
        super().__init__(storage, driver)
        self.client = TelegramClient("./anon", config.api_id, config.api_hash)
        if config:
            self.client = TelegramClient("./anon", config.api_id, config.api_hash)
            self.bot_token = config.bot_token

    def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
        """

@@ -31,8 +28,8 @@ class TelethonArchiver(Archiver):
        of `max_amp` both ways
        Returns a list of [post] where each post has media and is in the same grouped_id
        """
        if original_post.grouped_id is None:
            return [original_post] if original_post.media is not None else []
        if getattr_or(original_post, "grouped_id") is None:
            return [original_post] if getattr_or(original_post, "media") else []

        search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
        posts = self.client.get_messages(chat, ids=search_ids)

@@ -43,16 +40,19 @@ class TelethonArchiver(Archiver):
        return media

    def download(self, url, check_if_exists=False):
        if not hasattr(self, "client"):
            logger.error('Missing Telethon config')
            return False

        # detect URLs that we definitely cannot handle
        matches = self.link_pattern.findall(url)
        if not len(matches):
            return False

        status = "success"
        screenshot = self.get_screenshot(url)

        # app will ask (stall for user input!) for phone number and auth code if anon.session not found
        with self.client.start():
        with self.client.start(bot_token=self.bot_token):
            matches = list(matches[0])
            chat, post_id = matches[1], matches[2]

@@ -61,16 +61,20 @@ class TelethonArchiver(Archiver):
            try:
                post = self.client.get_messages(chat, ids=post_id)
            except ValueError as e:
                logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}')
                logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
                return False
            except ChannelInvalidError as e:
                # TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819
                logger.error(f'Could not fetch telegram {url} possibly it\'s private or not displayable in : {e}')
                logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
                return False

            media_posts = self._get_media_posts_in_group(chat, post)
            if post is None: return False

            if len(media_posts) > 1:
            media_posts = self._get_media_posts_in_group(chat, post)
            logger.debug(f'got {len(media_posts)=} for {url=}')

            screenshot = self.get_screenshot(url)

            if len(media_posts) > 0:
                key = self.get_html_key(url)

                if check_if_exists and self.storage.exists(key):

@@ -82,30 +86,22 @@ class TelethonArchiver(Archiver):
                group_id = post.grouped_id if post.grouped_id is not None else post.id
                uploaded_media = []
                message = post.message
                for mp in media_posts:
                for i, mp in enumerate(media_posts):
                    if len(mp.message) > len(message): message = mp.message
                    filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
                    key = filename.split('tmp/')[1]
                    filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id))
                    filename = self.client.download_media(mp.media, filename_dest)
                    key = filename.split(Storage.TMP_FOLDER)[1]
                    self.storage.upload(filename, key)
                    hash = self.get_hash(filename)
                    cdn_url = self.storage.get_cdn_url(key)
                    uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
                    if i == 0:
                        key_thumb, thumb_index = self.get_thumbnails(filename, key)
                    os.remove(filename)

                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))

                return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
            elif len(media_posts) == 1:
                key = self.get_key(f'{chat}_{post_id}')
                filename = self.client.download_media(post.media, f'tmp/{key}')
                key = filename.split('tmp/')[1].replace(" ", "")
                self.storage.upload(filename, key)
                hash = self.get_hash(filename)
                cdn_url = self.storage.get_cdn_url(key)
                key_thumb, thumb_index = self.get_thumbnails(filename, key)
                os.remove(filename)

                return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
                return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)

            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)
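A sketch (not part of the diff) of what the link_pattern above extracts from public and private t.me links; the example URLs are made up:

import re

link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)")
for url in ["https://t.me/somechannel/172", "https://t.me/c/1234567890/123"]:
    matches = list(link_pattern.findall(url)[0])
    chat, post_id = matches[1], matches[2]  # private '/c/' links produce the same groups
    print(chat, post_id)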
@@ -3,6 +3,7 @@ import tiktok_downloader
from loguru import logger

from .base_archiver import Archiver, ArchiveResult
from storages import Storage


class TiktokArchiver(Archiver):

@@ -17,8 +18,8 @@ class TiktokArchiver(Archiver):
        try:
            info = tiktok_downloader.info_post(url)
            key = self.get_key(f'{info.id}.mp4')
            cdn_url = self.storage.get_cdn_url(key)
            filename = 'tmp/' + key
            filename = os.path.join(Storage.TMP_FOLDER, key)
            logger.info(f'found video {key=}')

            if check_if_exists and self.storage.exists(key):
                status = 'already archived'

@@ -27,13 +28,15 @@ class TiktokArchiver(Archiver):

            if len(media) <= 0:
                if status == 'already archived':
                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
                else:
                    return ArchiveResult(status='Could not download media')

            logger.info(f'downloading video {key=}')
            media[0].download(filename)

            if status != 'already archived':
                logger.info(f'uploading video {key=}')
                self.storage.upload(filename, key)

            try:

@@ -49,10 +52,12 @@ class TiktokArchiver(Archiver):
            try: os.remove(filename)
            except FileNotFoundError:
                logger.info(f'tmp file not found thus not deleted {filename}')
            cdn_url = self.storage.get_cdn_url(key)
            timestamp = info.create.isoformat() if hasattr(info, "create") else None

            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
                                 hash=hash, screenshot=screenshot)
                                 thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
                                 timestamp=timestamp, hash=hash, screenshot=screenshot)

        except tiktok_downloader.Except.InvalidUrl as e:
            status = 'Invalid URL'
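A sketch of the defensive attribute access introduced above, shown on a stand-in object (tiktok_downloader's real info object is not reproduced here):

class FakeInfo:  # hypothetical stand-in for tiktok_downloader.info_post(url)
    duration = 15

info = FakeInfo()
duration = getattr(info, "duration", 0)   # 15
title = getattr(info, "caption", "")      # "" -- attribute missing, default used
timestamp = info.create.isoformat() if hasattr(info, "create") else None  # None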
@@ -1,6 +1,8 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger

import html
from urllib.parse import urlparse
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo

from .base_archiver import Archiver, ArchiveResult


@@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
    def download(self, url, check_if_exists=False):

        if 'twitter.com' != self.get_netloc(url):
            logger.debug(f'{url=} is not from twitter')
            return False

        tweet_id = urlparse(url).path.split('/')

@@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
            i = tweet_id.index('status')
            tweet_id = tweet_id[i + 1]
        else:
            logger.debug(f'{url=} does not contain "status"')
            return False

        scr = TwitterTweetScraper(tweet_id)

@@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
            return False

        if tweet.media is None:
            logger.trace(f'No media found')
            return False
            logger.debug(f'No media found, archiving tweet text only')
            screenshot = self.get_screenshot(url)
            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)

        urls = []

@@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):

        screenshot = self.get_screenshot(url)

        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
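A sketch of the tweet-id extraction used above; the example URL is made up:

from urllib.parse import urlparse

url = "https://twitter.com/some_user/status/1499415562937503751"  # hypothetical
parts = urlparse(url).path.split('/')
tweet_id = parts[parts.index('status') + 1] if 'status' in parts else None
print(tweet_id)  # 1499415562937503751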
@@ -0,0 +1,89 @@
import re, json, requests

import vk_api, dateparser
from bs4 import BeautifulSoup
from loguru import logger

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig


class VkArchiver(Archiver):
    """
    VK videos are handled by YTDownloader, this archiver gets posts text and images.
    Currently only works for /wall posts
    """
    name = "vk"
    wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
    photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
    onclick_pattern = re.compile(r"({.*})")

    def __init__(self, storage: Storage, driver, config: VkConfig):
        super().__init__(storage, driver)
        if config != None:
            self.vk_session = vk_api.VkApi(config.username, config.password)
            self.vk_session.auth(token_only=True)

    def download(self, url, check_if_exists=False):
        # detect URLs that this archiver can handle
        _id, method = None, None
        if has_wall := self.wall_pattern.search(url):
            _id = has_wall[0]
            method = self.archive_wall
        elif has_photo := self.photo_pattern.search(url):
            _id = has_photo[0]
            method = self.archive_photo
        else: return False

        logger.info(f"found valid {_id=} from {url=}")
        proper_url = f'https://vk.com/{_id}'

        # if check if exists will not download again
        key = self.get_html_key(proper_url)
        if check_if_exists and self.storage.exists(key):
            screenshot = self.get_screenshot(proper_url)
            cdn_url = self.storage.get_cdn_url(key)
            return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)

        try:
            return method(proper_url, _id)
        except Exception as e:
            logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}")
        return False

    def archive_photo(self, photo_url, photo_id):
        headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version}
        req = requests.get("https://api.vk.com/method/photos.getById", headers)
        res = req.json()["response"][0]
        title = res["text"][:200]  # more on the page
        img_url = res["orig_photo"]["url"]
        time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})

        page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res)
        screenshot = self.get_screenshot(photo_url)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)

    def archive_wall(self, wall_url, wall_id):
        headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version}
        req = requests.get("https://api.vk.com/method/wall.getById", headers)
        res = req.json()["response"]
        wall = res["items"][0]
        img_urls = []
        if "attachments" in wall:
            for a in wall["attachments"]:
                attachment = a[a["type"]]
                if "thumb" in attachment:
                    attachment = attachment["thumb"]
                if "sizes" in attachment:
                    try: img_urls.append(attachment["sizes"][-1]["url"])
                    except Exception as e:
                        logger.warning(f"could not get image from attachment: {e}")

        title = wall["text"][:200]  # more on the page
        time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})

        page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res)
        screenshot = self.get_screenshot(wall_url)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
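A sketch of how the walrus-operator dispatch above routes VK URLs; the example URLs are made up:

import re

wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")

for url in ["https://vk.com/wall-1234567_890", "https://vk.com/photo-1234567_456789"]:
    if has_wall := wall_pattern.search(url):
        print("wall post:", f"https://vk.com/{has_wall[0]}")
    elif has_photo := photo_pattern.search(url):
        print("photo:", f"https://vk.com/{has_photo[0]}")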
@@ -1,81 +1,88 @@
import time, requests, os
import time, requests

from loguru import logger
from bs4 import BeautifulSoup

from storages import Storage
from .base_archiver import Archiver, ArchiveResult

from loguru import logger
from configs import WaybackConfig


class WaybackArchiver(Archiver):
    """
    This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}"
    but that might not be desirable since the webpage might have been archived a long time ago and thus have changed
    """
    name = "wayback"

    def __init__(self, storage: Storage, driver):
    def __init__(self, storage: Storage, driver, config: WaybackConfig):
        super(WaybackArchiver, self).__init__(storage, driver)
        self.config = config
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]
        if self.config is None:
            logger.error('Missing Wayback config')
            return False
        if check_if_exists:
            if url in self.seen_urls: return self.seen_urls[url]

        screenshot = self.get_screenshot(url)
        logger.debug(f"POSTing {url=} to web.archive.org")
        ia_headers = {
            "Accept": "application/json",
            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
            "Authorization": f"LOW {self.config.key}:{self.config.secret}"
        }

        r = requests.post(
            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            logger.warning(f"Internet archive failed with status of {r.status_code}")
            return ArchiveResult(status="Internet archive failed")
            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)

        if 'job_id' not in r.json() and 'message' in r.json():
            logger.warning(f"Internet archive failed json \n {r.json()}")
            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
            return self.custom_retry(r.json(), screenshot=screenshot)

        job_id = r.json()['job_id']

        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)

        logger.debug(f"GETting status for {job_id=} on {url=}")
        status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
        retries = 0

        # TODO: make the job queue parallel -> consider propagation of results back to sheet though
        # wait 90-120 seconds for the archive job to finish
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)

            try:
                status_r = requests.get(
                    'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
                logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
                status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
            except:
                time.sleep(1)

            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status="Internet archive failed")
            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)

        status_json = status_r.json()

        if status_json['status'] != 'success':
            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
            return self.custom_retry(status_json, screenshot=screenshot)

        archive_url = 'https://web.archive.org/web/' + \
            status_json['timestamp'] + '/' + status_json['original_url']
        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

        try:
            r = requests.get(archive_url)

            parsed = BeautifulSoup(r.content, 'html.parser')

            req = requests.get(archive_url)
            parsed = BeautifulSoup(req.content, 'html.parser')
            title = parsed.find_all('title')[0].text

            if title == 'Wayback Machine':
                title = 'Could not get title'
        except:
            title = "Could not get title"

        screenshot = self.get_screenshot(url)
        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
        self.seen_urls[url] = result
        return result
        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
        return self.seen_urls[url]

    def custom_retry(self, json_data, **kwargs):
        logger.warning(f"Internet archive failed json \n {json_data}")
        if "please try again" in str(json_data).lower():
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in str(json_data).lower():
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)  # 24h to 36h later
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
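A sketch of the Save Page Now exchange this archiver performs, reduced to its two calls; the credentials are placeholders and the endpoints are the ones used above:

import requests

key, secret = "IA_S3_KEY", "IA_S3_SECRET"  # placeholders, obtained from https://archive.org/account/s3.php
ia_headers = {"Accept": "application/json", "Authorization": f"LOW {key}:{secret}"}

r = requests.post("https://web.archive.org/save/", headers=ia_headers, data={"url": "https://example.com"})
job_id = r.json().get("job_id")  # on failure a 'message' key comes back instead, handled by custom_retry
status = requests.get(f"https://web.archive.org/save/status/{job_id}", headers=ia_headers).json()
# poll until status['status'] != 'pending', then the capture lives at:
# https://web.archive.org/web/{status['timestamp']}/{status['original_url']}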
@@ -1,6 +1,6 @@

import os
import datetime
import os, datetime

import yt_dlp
from loguru import logger


@@ -10,7 +10,7 @@ from storages import Storage

class YoutubeDLArchiver(Archiver):
    name = "youtube_dl"
    ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
    ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}

    def __init__(self, storage: Storage, driver, fb_cookie):
        super().__init__(storage, driver)

@@ -18,7 +18,7 @@ class YoutubeDLArchiver(Archiver):

    def download(self, url, check_if_exists=False):
        netloc = self.get_netloc(url)
        if netloc in ['facebook.com', 'www.facebook.com']:
        if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.fb_cookie

@@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver):

        os.remove(filename)

        timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
            if 'timestamp' in info else \
            datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
            if 'upload_date' in info and info['upload_date'] is not None else \
            None
        timestamp = None
        if 'timestamp' in info and info['timestamp'] is not None:
            timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
        elif 'upload_date' in info and info['upload_date'] is not None:
            timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                             title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)
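A sketch of the flattened timestamp logic above, run against a hypothetical yt-dlp info dict:

import datetime

info = {"upload_date": "20220315"}  # hypothetical yt-dlp info dict
timestamp = None
if 'timestamp' in info and info['timestamp'] is not None:
    timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
elif 'upload_date' in info and info['upload_date'] is not None:
    timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
print(timestamp)  # 2022-03-15 00:00:00+00:00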
222 auto_archive.py
@@ -1,30 +1,17 @@
import os
import datetime
import argparse
import requests
import shutil
import gspread
import os, datetime, shutil, traceback, random

from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import traceback
from slugify import slugify

import archivers
from storages import S3Storage, S3Config
from storages.gd_storage import GDConfig, GDStorage
from utils import GWorksheet, mkdir_if_not_exists
import sys
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
from storages import Storage

logger.add("logs/1trace.log", level="TRACE")
logger.add("logs/2info.log", level="INFO")
logger.add("logs/3success.log", level="SUCCESS")
logger.add("logs/4warning.log", level="WARNING")
logger.add("logs/5error.log", level="ERROR")

load_dotenv()
random.seed()


def update_sheet(gw, row, result: archivers.ArchiveResult):
def update_sheet(gw, row, result: ArchiveResult):
    cell_updates = []
    row_values = gw.get_row(row)


@@ -37,8 +24,7 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):

    batch_if_valid('archive', result.cdn_url)
    batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
    batch_if_valid('thumbnail', result.thumbnail,
                   f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))

@@ -58,169 +44,107 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
    gw.batch_set_cell(cell_updates)


def expand_url(url):
    # expand short URL links
    if 'https://t.co/' in url:
        try:
            r = requests.get(url)
            url = r.url
        except:
            logger.error(f'Failed to expand url {url}')
    return url
def missing_required_columns(gw: GWorksheet):
    missing = False
    for required_col in ['url', 'status']:
        if not gw.col_exists(required_col):
            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
            missing = True
    return missing


def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES):
    gc = gspread.service_account(filename='service_account.json')
    sh = gc.open(sheet)

    s3_config = S3Config(
        bucket=os.getenv('DO_BUCKET'),
        region=os.getenv('DO_SPACES_REGION'),
        key=os.getenv('DO_SPACES_KEY'),
        secret=os.getenv('DO_SPACES_SECRET')
    )
    gd_config = GDConfig(
        root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'),
    )
    telegram_config = archivers.TelegramConfig(
        api_id=os.getenv('TELEGRAM_API_ID'),
        api_hash=os.getenv('TELEGRAM_API_HASH')
    )
def process_sheet(c: Config):
    sh = c.gsheets_client.open(c.sheet)

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}')
        gw = GWorksheet(wks, header_row=header, columns=columns)
        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

        if not gw.col_exists('url'):
            logger.info(
                f'No "{columns["url"]}" column found, skipping worksheet {wks.title}')
            continue
        if missing_required_columns(gw): continue

        if not gw.col_exists('status'):
            logger.info(
                f'No "{columns["status"]}" column found, skipping worksheet {wks.title}')
            continue

        # archives will be in a folder 'doc_name/worksheet_name'
        s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
        s3_client = S3Storage(s3_config)

        gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
        gd_client = GDStorage(gd_config)
        # archives will default to being in a folder 'doc_name/worksheet_name'
        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
        c.set_folder(default_folder)
        storage = c.get_storage()

        # loop through rows in worksheet
        for row in range(1 + header, gw.count_rows() + 1):
        for row in range(1 + c.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url')
            original_status = gw.get_cell(row, 'status')
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')

            if url != '' and status in ['', None]:
            is_retry = False
            if url == '' or status not in ['', None]:
                is_retry = Archiver.should_retry_from_status(status)
                if not is_retry: continue

            # All checks done - archival process starts here
            try:
                gw.set_cell(row, 'status', 'Archive in progress')

                url = expand_url(url)

                subfolder = gw.get_cell_or_default(row, 'subfolder')
                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))

                # make a new driver so each spreadsheet row is idempotent
                options = webdriver.FirefoxOptions()
                options.headless = True
                options.set_preference('network.protocol-handler.external.tg', False)

                driver = webdriver.Firefox(options=options)
                driver.set_window_size(1400, 2000)
                # in seconds, telegram screenshots catch which don't come back
                driver.set_page_load_timeout(120)

                # client
                storage_client = None
                if storage == "s3":
                    storage_client = s3_client
                elif storage == "gd":
                    storage_client = gd_client
                else:
                    raise ValueError(f'Cant get storage_client {storage_client}')
                storage_client.update_properties(subfolder=subfolder)
                c.recreate_webdriver()

                # order matters, first to succeed excludes remaining
                active_archivers = [
                    archivers.TelethonArchiver(storage_client, driver, telegram_config),
                    archivers.TelegramArchiver(storage_client, driver),
                    archivers.TiktokArchiver(storage_client, driver),
                    archivers.YoutubeDLArchiver(storage_client, driver, os.getenv('FACEBOOK_COOKIE')),
                    archivers.TwitterArchiver(storage_client, driver),
                    archivers.WaybackArchiver(storage_client, driver)
                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
                    TiktokArchiver(storage, c.webdriver),
                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                    TelegramArchiver(storage, c.webdriver),
                    TwitterArchiver(storage, c.webdriver),
                    VkArchiver(storage, c.webdriver, c.vk_config),
                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
                ]

                for archiver in active_archivers:
                    logger.debug(f'Trying {archiver} on row {row}')
                    logger.debug(f'Trying {archiver} on {row=}')

                    try:
                        result = archiver.download(url, check_if_exists=True)
                    except KeyboardInterrupt:
                        logger.warning("caught interrupt")
                        gw.set_cell(row, 'status', '')
                        driver.quit()
                        exit()
                        result = archiver.download(url, check_if_exists=c.check_if_exists)
                    except KeyboardInterrupt as e: raise e  # so the higher level catch can catch it
                    except Exception as e:
                        result = False
                        logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}')
                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

                    if result:
                        # IA is a Success I believe - or do we want to display a logger warning for it?
                        if result.status in ['success', 'already archived', 'Internet Archive fallback']:
                            result.status = archiver.name + \
                                ": " + str(result.status)
                            logger.success(
                                f'{archiver} succeeded on row {row}, url {url}')
                        success = result.status in ['success', 'already archived']
                        result.status = f"{archiver.name}: {result.status}"
                        if success:
                            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                            break
                        # only 1 retry possible for now
                        if is_retry and Archiver.is_retry(result.status):
                            result.status = Archiver.remove_retry(result.status)
                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

                        # wayback has seen this url before so keep existing status
                        if "wayback: Internet Archive fallback" in result.status:
                            logger.success(
                                f'wayback has seen this url before so keep existing status on row {row}')
                            result.status = result.status.replace(' (duplicate)', '')
                            result.status = str(result.status) + " (duplicate)"
                            break

                        logger.warning(
                            f'{archiver} did not succeed on {row=}, final status: {result.status}')
                        result.status = archiver.name + \
                            ": " + str(result.status)
                # get rid of driver so can reload on next row
                driver.quit()
                if result:
                    update_sheet(gw, row, result)
                else:
                    gw.set_cell(row, 'status', 'failed: no archiver')
                logger.success(f'Finshed worksheet {wks.title}')
            except KeyboardInterrupt:
                # catches keyboard interruptions to do a clean exit
                logger.warning(f"caught interrupt on {row=}, {url=}")
                gw.set_cell(row, 'status', '')
                c.destroy_webdriver()
                exit()
            except Exception as e:
                logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
                gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
        logger.success(f'Finished worksheet {wks.title}')


@logger.catch
def main():
    logger.debug(f'Passed args:{sys.argv}')

    parser = argparse.ArgumentParser(
        description='Automatically archive social media videos from a Google Sheets document')
    parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True)
    parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row')
    parser.add_argument('--private', action='store_true', help='Store content without public access permission')

    parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"})

    for k, v in GWorksheet.COLUMN_NAMES.items():
        help = f"the name of the column to fill with {k} (defaults={v})"
        if k == "subfolder":
            help = f"the name of the column to read the {k} from (defaults={v})"
        parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help)

    args = parser.parse_args()
    config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()}

    logger.info(f'Opening document {args.sheet} for header {args.header} and storage {args.storage}')

    mkdir_if_not_exists('tmp')
    process_sheet(args.sheet, args.storage, args.header, config_columns)
    shutil.rmtree('tmp')
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} for header {c.header}')
    mkdir_if_not_exists(Storage.TMP_FOLDER)
    process_sheet(c)
    c.destroy_webdriver()
    shutil.rmtree(Storage.TMP_FOLDER)


if __name__ == '__main__':
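A sketch of the new row-gating rule in process_sheet: a row is archived when it has a URL and either an empty status or a 'retrying at <ts>' status whose time has passed. The helper name is hypothetical; the logic mirrors the diff:

from archivers import Archiver

def should_process_row(url: str, status: str) -> bool:  # hypothetical helper
    if url == '':
        return False
    if status in ['', None]:
        return True
    return Archiver.should_retry_from_status(status)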
@@ -1,29 +1,30 @@
import gspread
import argparse
import shutil
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
from utils import mkdir_if_not_exists


def main():
    parser = argparse.ArgumentParser(
        description="Automatically use youtube-dl to download media from a Google Sheet")
    parser.add_argument("--sheet", action="store", dest="sheet")
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} to look for sheet names to archive')

    args = parser.parse_args()

    logger.info("Opening document " + args.sheet)

    gc = gspread.service_account(filename='service_account.json')
    sh = gc.open(args.sheet)
    gc = c.gsheets_client
    sh = gc.open(c.sheet)

    wks = sh.get_worksheet(0)
    values = wks.get_all_values()

    mkdir_if_not_exists(Storage.TMP_FOLDER)
    for i in range(11, len(values)):
        sheet_name = values[i][0]
        c.sheet = values[i][0]
        logger.info(f"Processing {c.sheet}")
        auto_archive.process_sheet(c)
    c.destroy_webdriver()
    shutil.rmtree(Storage.TMP_FOLDER)

        logger.info("Processing " + sheet_name)

        auto_archive.process_sheet(sheet_name)

if __name__ == "__main__":
    main()
@@ -0,0 +1,5 @@
from .config import Config
from .selenium_config import SeleniumConfig
from .telethon_config import TelethonConfig
from .wayback_config import WaybackConfig
from .vk_config import VkConfig
@@ -0,0 +1,252 @@

import argparse, yaml, json
import gspread
from loguru import logger
from selenium import webdriver
from dataclasses import asdict
from selenium.common.exceptions import TimeoutException

from utils import GWorksheet, getattr_or
from .wayback_config import WaybackConfig
from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig


class Config:
    """
    Controls the current execution parameters and manages API configurations
    Usage:
      c = Config()  # initializes the argument parser
      c.parse()  # parses the values and initializes the Services and API clients
      # you can then access the Services and APIs like 'c.s3_config'
    All the configurations available as cmd line options, when included, will
    override the configurations in the config.yaml file.
    Configurations are split between:
    1. "secrets" containing API keys for generating services - not kept in memory
    2. "execution" containing specific execution configurations
    """
    AVAILABLE_STORAGES = {"s3", "gd", "local"}

    def __init__(self):
        self.parser = self.get_argument_parser()
        self.folder = ""

    def parse(self):
        self.args = self.parser.parse_args()
        logger.success(f'Command line arguments parsed successfully')
        self.config_file = self.args.config
        self.read_config_yaml()
        logger.info(f'APIs and Services initialized:\n{self}')

    def read_config_yaml(self):
        with open(self.config_file, "r", encoding="utf-8") as inf:
            self.config = yaml.safe_load(inf)

        # ---------------------- EXECUTION - execution configurations
        execution = self.config.get("execution", {})

        self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
        assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
        self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
        Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER)
        self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
        self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
        if self.save_logs:
            self.set_log_files()
        self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)

        # Column names come from config and can be overwritten by CMD
        # in the end all are considered as lower case
        config_column_names = execution.get("column_names", {})
        self.column_names = {}
        for k in GWorksheet.COLUMN_NAMES.keys():
            self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower()

        # selenium driver
        selenium_configs = execution.get("selenium", {})
        self.selenium_config = SeleniumConfig(
            timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
            window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
            window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
        )
        self.webdriver = "not initialized"

        # ---------------------- SECRETS - APIs and service configurations
        secrets = self.config.get("secrets", {})

        # assert selected storage credentials exist
        for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
            assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"

        # google sheets config
        self.gsheets_client = gspread.service_account(
            filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
        )

        # facebook config
        self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)

        # s3 config
        if "s3" in secrets:
            s3 = secrets["s3"]
            self.s3_config = S3Config(
                bucket=s3["bucket"],
                region=s3["region"],
                key=s3["key"],
                secret=s3["secret"],
                endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
                cdn_url=s3.get("cdn_url", S3Config.cdn_url),
                key_path=s3.get("key_path", S3Config.key_path),
                private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private))
            )

        # GDrive config
        if "google_drive" in secrets:
            gd = secrets["google_drive"]
            self.gd_config = GDConfig(
                root_folder_id=gd.get("root_folder_id"),
                service_account=gd.get("service_account", GDConfig.service_account)
            )

        if "local" in secrets:
            self.local_config = LocalConfig(
                save_to=secrets["local"].get("save_to", LocalConfig.save_to),
            )

        # wayback machine config
        if "wayback" in secrets:
            self.wayback_config = WaybackConfig(
                key=secrets["wayback"]["key"],
                secret=secrets["wayback"]["secret"],
            )
        else:
            self.wayback_config = None
            logger.debug(f"'wayback' key not present in the {self.config_file=}")

        # telethon config
        if "telegram" in secrets:
            self.telegram_config = TelethonConfig(
                api_id=secrets["telegram"]["api_id"],
                api_hash=secrets["telegram"]["api_hash"],
                bot_token=secrets["telegram"].get("bot_token", None)
            )
        else:
            self.telegram_config = None
            logger.debug(f"'telegram' key not present in the {self.config_file=}")

        # vk config
        if "vk" in secrets:
            self.vk_config = VkConfig(
                username=secrets["vk"]["username"],
                password=secrets["vk"]["password"]
            )
        else:
            self.vk_config = None
            logger.debug(f"'vk' key not present in the {self.config_file=}")

        del self.config["secrets"]  # delete to prevent leaks

    def set_log_files(self):
        # called only when config.execution.save_logs=true
        logger.add("logs/1trace.log", level="TRACE")
        logger.add("logs/2info.log", level="INFO")
        logger.add("logs/3success.log", level="SUCCESS")
        logger.add("logs/4warning.log", level="WARNING")
        logger.add("logs/5error.log", level="ERROR")

    def get_argument_parser(self):
        """
        Creates the CMD line arguments. 'python auto_archive.py --help'
        """
        parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')

        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
        parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
        parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
        parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
        parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [execution.check_if_exists]')
        parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [execution.save_logs]')
        parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')

        for k, v in GWorksheet.COLUMN_NAMES.items():
            help = f"the name of the column to FILL WITH {k} (default='{v}')"
            if k in ["url", "folder"]:
                help = f"the name of the column to READ {k} FROM (default='{v}')"
            parser.add_argument(f'--col-{k}', action='store', dest=k, help=help)

        return parser

    def set_folder(self, folder):
        """
        update the folder in each of the storages
        """
        self.folder = folder
        # s3
        if hasattr(self, "s3_config"): self.s3_config.folder = folder
        if hasattr(self, "s3_storage"): self.s3_storage.folder = folder
        # gdrive
        if hasattr(self, "gd_config"): self.gd_config.folder = folder
        if hasattr(self, "gd_storage"): self.gd_storage.folder = folder
        # local
        if hasattr(self, "local_config"): self.local_config.folder = folder
        if hasattr(self, "local_storage"): self.local_storage.folder = folder

    def get_storage(self):
        """
        returns the configured type of storage, creating if needed
        """
        if self.storage == "s3":
            self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config))
            return self.s3_storage
        elif self.storage == "gd":
            self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config))
            return self.gd_storage
        elif self.storage == "local":
            self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config))
            return self.local_storage
        raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}"

    def destroy_webdriver(self):
        if self.webdriver is not None and type(self.webdriver) != str:
            self.webdriver.quit()
            del self.webdriver

    def recreate_webdriver(self):
        options = webdriver.FirefoxOptions()
        options.headless = True
        options.set_preference('network.protocol-handler.external.tg', False)
        try:
            new_webdriver = webdriver.Firefox(options=options)
            # only destroy if creation is successful
            self.destroy_webdriver()
            self.webdriver = new_webdriver
            self.webdriver.set_window_size(self.selenium_config.window_width,
                                           self.selenium_config.window_height)
            self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
        except TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

    def __str__(self) -> str:
        return json.dumps({
            "config_file": self.config_file,
            "sheet": self.sheet,
            "storage": self.storage,
            "header": self.header,
            "check_if_exists": self.check_if_exists,
            "save_logs": self.save_logs,
            "tmp_folder": Storage.TMP_FOLDER,
            "selenium_config": asdict(self.selenium_config),
            "selenium_webdriver": self.webdriver != None,
            "s3_config": hasattr(self, "s3_config"),
            "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
            "gd_config": hasattr(self, "gd_config"),
            "local_config": hasattr(self, "local_config"),
            "wayback_config": self.wayback_config != None,
            "telegram_config": self.telegram_config != None,
            "vk_config": self.vk_config != None,
            "gsheets_client": self.gsheets_client != None,
            "column_names": self.column_names,
        }, ensure_ascii=False, indent=4)
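A usage sketch following the class docstring above, assuming it runs inside a script that accepts the same CMD arguments and has a config.yaml present:

from configs import Config

c = Config()        # initializes the argument parser
c.parse()           # reads CMD + config.yaml, builds API clients
storage = c.get_storage()        # S3Storage, GDStorage or LocalStorage
c.set_folder("my-sheet/tab-1/")  # propagated to whichever storages exist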
@@ -0,0 +1,8 @@
from dataclasses import dataclass


@dataclass
class SeleniumConfig:
    timeout_seconds: int = 120
    window_width: int = 1400
    window_height: int = 2000
@@ -0,0 +1,9 @@

from dataclasses import dataclass


@dataclass
class TelethonConfig:
    api_id: str
    api_hash: str
    bot_token: str
@@ -0,0 +1,8 @@

from dataclasses import dataclass


@dataclass
class VkConfig:
    username: str
    password: str
@@ -0,0 +1,8 @@

from dataclasses import dataclass


@dataclass
class WaybackConfig:
    key: str
    secret: str
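Config.read_config_yaml builds these dataclasses straight from the secrets section; a sketch with placeholder values:

from configs import WaybackConfig, TelethonConfig

wayback = WaybackConfig(key="IA key", secret="IA secret")  # placeholders
telethon = TelethonConfig(api_id="123456", api_hash="abcdef", bot_token=None)  # placeholders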
@@ -0,0 +1,88 @@
---
secrets:
  # needed if you use storage=s3
  s3:
    # contains S3 info on region, bucket, key and secret
    region: reg1
    bucket: my-bucket
    key: "s3 API key"
    secret: "s3 API secret"
    # the {region} placeholder is filled in from the region above, like such:
    endpoint_url: 'https://{region}.digitaloceanspaces.com'
    # use the {bucket}, {region}, and {key} placeholders (key is the archived file path generated at execution time), like such:
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private: true, S3 urls will not be readable online
    private: false
    # 'random' generates a random UUID for the URL instead of a predictable path, useful to have public but unlisted files; the alternative is 'default', which can also simply be omitted from the config
    key_path: random

  # needed if you use storage=gd
  google_drive:
    # local filename: can be the same as or a different file from google_sheets.service_account, defaults to service_account.json
    service_account: "service_account.json"
    root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX

  # needed if you use storage=local
  local:
    # local path to save files in
    save_to: "./local_archive"

  wayback:
    # to get credentials visit https://archive.org/account/s3.php
    key: your API key
    secret: your API secret

  telegram:
    # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    api_id: your API id
    api_hash: your API hash
    # optional, but allows access to more content such as large videos, talk to @botfather
    bot_token: your bot-token

  # vkontakte (vk.com) credentials
  vk:
    username: "phone number or email"
    password: "password"

  google_sheets:
    # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: "service_account.json"

  facebook:
    # optional facebook cookie for more access to content, taken from the browser, looks like 'cookie: datr= xxxx'
    cookie: ""

execution:
  # can be overwritten with CMD --sheet=
  sheet: your-sheet-name
  # which row of your tabs contains the header, can be overwritten with CMD --header=
  header: 1
  # which storage to use, can be overwritten with CMD --storage=
  storage: s3
  # defaults to false, when true will try to avoid duplicate URL archives
  check_if_exists: true
  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # values under 10s may mean the screenshot fails to be captured
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000
  # local tmp folder to save files before uploading to storage
  tmp_folder: tmp/
  # puts execution logs into /logs folder, defaults to false
  save_logs: true
  # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
  # url and status are the only columns required to be present in the google sheet
  column_names:
    url: link
    status: archive status
    archive: archive location
    # use this column to override default location data
    folder: folder
    date: archive date
    thumbnail: thumbnail
    thumbnail_index: thumbnail index
    timestamp: upload timestamp
    title: upload title
    duration: duration
    screenshot: screenshot
    hash: hash
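A minimal sketch of how a config.yaml like the above could be loaded into the dataclasses defined earlier, using pyyaml (listed in the Pipfile); the commit's real Config class may wire this differently:

import yaml

with open("config.yaml") as f:
    raw = yaml.safe_load(f)

secrets, execution = raw["secrets"], raw["execution"]
wayback_config = WaybackConfig(**secrets["wayback"])
telethon_config = TelethonConfig(**secrets["telegram"])
selenium_config = SeleniumConfig(**execution.get("selenium", {}))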
@@ -1,3 +1,5 @@
 # we need to explicitly expose the available imports here
-from .base_storage import *
-from .s3_storage import *
+from .base_storage import Storage
+from .local_storage import LocalStorage, LocalConfig
+from .s3_storage import S3Config, S3Storage
+from .gd_storage import GDConfig, GDStorage
@@ -4,6 +4,8 @@ from pathlib import Path


 class Storage(ABC):
+    TMP_FOLDER = "tmp/"
+
     @abstractmethod
     def __init__(self, config): pass

@@ -20,25 +22,3 @@ class Storage(ABC):
         logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
         with open(filename, 'rb') as f:
             self.uploadf(f, key, **kwargs)
-
-    def update_properties(self, **kwargs):
-        """
-        method used to update general properties that some children may use
-        and others not, but that all can call
-        """
-        for k, v in kwargs.items():
-            if k in self.get_allowed_properties():
-                setattr(self, k, v)
-            else:
-                logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"')
-
-    def get_allowed_properties(self):
-        """
-        child classes should specify which properties they allow to be set
-        """
-        return set(["subfolder"])
-
-    def clean_path(self, folder, default="", add_forward_slash=True):
-        if folder is None or type(folder) != str or len(folder.strip()) == 0:
-            return default
-        return str(Path(folder)) + ("/" if add_forward_slash else "")
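For illustration, a minimal Storage subclass; it assumes get_cdn_url, exists, and uploadf are the remaining abstract methods, as the concrete storages below suggest. This is a sketch, not part of the commit:

class MemoryStorage(Storage):
    # toy storage that keeps uploads in a dict; the inherited upload()
    # opens the file from disk and delegates to uploadf()
    def __init__(self, config=None):
        self.files = {}

    def get_cdn_url(self, key):
        return f"memory://{key}"

    def exists(self, key):
        return key in self.files

    def uploadf(self, file, key, **kwargs):
        self.files[key] = file.read()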
@@ -1,26 +1,26 @@
+import os, time
+
 from loguru import logger
 from .base_storage import Storage
 from dataclasses import dataclass
 
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaFileUpload
 from google.oauth2 import service_account
 
-import time
-
 
 @dataclass
 class GDConfig:
     root_folder_id: str
+    folder: str = "default"
+    service_account: str = "service_account.json"
 
 
 class GDStorage(Storage):
-    DEFAULT_UPLOAD_FOLDER_NAME = "default"
-
     def __init__(self, config: GDConfig):
+        self.folder = config.folder
         self.root_folder_id = config.root_folder_id
-        SCOPES = ['https://www.googleapis.com/auth/drive']
-        creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES)
+        creds = service_account.Credentials.from_service_account_file(
+            config.service_account, scopes=['https://www.googleapis.com/auth/drive'])
         self.service = build('drive', 'v3', credentials=creds)
 
     def get_cdn_url(self, key):
@@ -28,160 +28,111 @@ class GDStorage(Storage):
         only support files saved in a folder for GD
         S3 supports folder and all stored in the root
         """
-        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
-        filename = key
-        logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD')
-
-        # retry policy on Google Drive
-        try_again = True
-        counter = 1
-        folder_id = None
-        while try_again:
-            # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url
-            results = self.service.files().list(
-                q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ",
-                spaces='drive',  # ie not appDataFolder or photos
-                fields='files(id, name)'
-            ).execute()
-            items = results.get('files', [])
-
-            for item in items:
-                logger.debug(f"found folder of {item['name']}")
-                folder_id = item['id']
-                try_again = False
-
-            if folder_id is None:
-                logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}')
-                counter += 1
-                time.sleep(10)
-                if counter > 18:
-                    raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes')
-
-        # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html
-        # happens doing thumbnails
-        a, _, b = filename.partition('/')
-
-        if b != '':
-            # a: 'youtube_dl_sDE-qZdi8p8'
-            # b: 'index.html'
-            logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}')
-
-            # get id of the sub folder
-            results = self.service.files().list(
-                q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
-                spaces='drive',  # ie not appDataFolder or photos
-                fields='files(id, name)'
-            ).execute()
-            items = results.get('files', [])
-
-            filename = None
-            for item in items:
-                folder_id = item['id']
-                filename = b
-            if filename is None:
-                raise ValueError(f'Problem finding sub folder {a}')
+        full_name = os.path.join(self.folder, key)
+        parent_id, folder_id = self.root_folder_id, None
+        path_parts = full_name.split(os.path.sep)
+        filename = path_parts[-1]
+        logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
+        for folder in path_parts[0:-1]:
+            folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
+            parent_id = folder_id
 
-        # get id of file inside folder (or sub folder)
-        results = self.service.files().list(
-            q=f"'{folder_id}' in parents and name = '{filename}' ",
-            spaces='drive',
-            fields='files(id, name)'
-        ).execute()
-        items = results.get('files', [])
+        file_id = self._get_id_from_parent_and_name(folder_id, filename)
+        return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
 
-        file_id = None
-        for item in items:
-            logger.debug(f"found file of {item['name']}")
-            file_id = item['id']
+    def exists(self, key):
+        try:
+            self.get_cdn_url(key)
+            return True
+        except: return False
 
-        if file_id is None:
-            raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}')
-
-        foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing"
-        return foo
-
-    def exists(self, _key):
-        # TODO: How to check for google drive, as it accepts different names
-        return False
-
-    def uploadf(self, file, key, **_kwargs):
-        logger.debug(f"before {self.subfolder=}")
-        self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False)
-        filename = key
-        logger.debug(f"after {self.subfolder=}")
-        # does folder eg SM0005 exist already inside parent of Files auto-archiver
-        results = self.service.files().list(
-            q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ",
-            spaces='drive',
-            fields='files(id, name)'
-        ).execute()
-        items = results.get('files', [])
-        folder_id_to_upload_to = None
-        if len(items) > 1:
-            logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway')
-
-        for item in items:
-            logger.debug(f"Found existing folder of {item['name']}")
-            folder_id_to_upload_to = item['id']
-
-        if folder_id_to_upload_to is None:
-            logger.debug(f'Creating new folder {self.subfolder}')
-            file_metadata = {
-                'name': [self.subfolder],
-                'mimeType': 'application/vnd.google-apps.folder',
-                'parents': [self.root_folder_id]
-            }
-            gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
-            folder_id_to_upload_to = gd_file.get('id')
-
-        # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg, eg: thumbnails
-        # will always return a and a blank b even if there is nothing to split
-        # https://stackoverflow.com/a/38149500/26086
-        a, _, b = filename.partition('/')
-
-        if b != '':
-            # a: 'youtube_dl_sDE-qZdi8p8'
-            # b: 'out1.jpg'
-            logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}')
-
-            # does the 'a' folder exist already in folder_id_to_upload_to eg SM0005
-            results = self.service.files().list(
-                q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ",
-                spaces='drive',  # ie not appDataFolder or photos
-                fields='files(id, name)'
-            ).execute()
-            items = results.get('files', [])
-            sub_folder_id_to_upload_to = None
-            if len(items) > 1:
-                logger.error(f'Duplicate folder name of {a} which should never happen')
-
-            for item in items:
-                logger.debug(f"Found existing folder of {item['name']}")
-                sub_folder_id_to_upload_to = item['id']
-
-            if sub_folder_id_to_upload_to is None:
-                # create new folder
-                file_metadata = {
-                    'name': [a],
-                    'mimeType': 'application/vnd.google-apps.folder',
-                    'parents': [folder_id_to_upload_to]
-                }
-                gd_file = self.service.files().create(body=file_metadata, fields='id').execute()
-                sub_folder_id_to_upload_to = gd_file.get('id')
-
-            filename = b
-            folder_id_to_upload_to = sub_folder_id_to_upload_to
-            # back to normal control flow
+    def uploadf(self, file: str, key: str, **_kwargs):
+        """
+        1. for each sub-folder in the path check if exists or create
+        2. upload file to root_id/other_paths.../filename
+        """
+        full_name = os.path.join(self.folder, key)
+        parent_id, upload_to = self.root_folder_id, None
+        path_parts = full_name.split(os.path.sep)
+        filename = path_parts[-1]
+        logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
+        for folder in path_parts[0:-1]:
+            upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+            if upload_to is None:
+                upload_to = self._mkdir(folder, parent_id)
+            parent_id = upload_to
 
+        # upload file to gd
+        logger.debug(f'uploading {filename=} to folder id {upload_to}')
         file_metadata = {
             'name': [filename],
-            'parents': [folder_id_to_upload_to]
+            'parents': [upload_to]
         }
         media = MediaFileUpload(file, resumable=True)
         gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+        logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
+
+    def upload(self, filename: str, key: str, **kwargs):
+        # GD only requires the filename, not a file reader
+        logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
+        self.uploadf(filename, key, **kwargs)
+
+    def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True):
+        """
+        Retrieves the id of a folder or file from its @name and the @parent_id folder
+        Optionally does multiple @retries and sleeps @sleep_seconds between them
+        If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
+        If @raise_on_missing, throws an error when not found, otherwise returns None
+        Will remember previous calls to avoid duplication if @use_cache
+        Returns the id of the file or folder from its name as a string
+        """
+        # cache logic
+        if use_cache:
+            self.api_cache = getattr(self, "api_cache", {})
+            cache_key = f"{parent_id}_{name}_{use_mime_type}"
+            if cache_key in self.api_cache:
+                logger.debug(f"cache hit for {cache_key=}")
+                return self.api_cache[cache_key]
+
+        # API logic
+        debug_header: str = f"[searching {name=} in {parent_id=}]"
+        query_string = f"'{parent_id}' in parents and name = '{name}' "
+        if use_mime_type:
+            query_string += f" and mimeType='application/vnd.google-apps.folder' "
+
+        for attempt in range(retries):
+            results = self.service.files().list(
+                q=query_string,
+                spaces='drive',  # ie not appDataFolder or photos
+                fields='files(id, name)'
+            ).execute()
+            items = results.get('files', [])
+
+            if len(items) > 0:
+                logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
+                _id = items[-1]['id']
+                if use_cache: self.api_cache[cache_key] = _id
+                return _id
+            else:
+                logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
+                if attempt < retries - 1:
+                    logger.debug(f'sleeping for {sleep_seconds} second(s)')
+                    time.sleep(sleep_seconds)
+
+        if raise_on_missing:
+            raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
+        return None
+
+    def _mkdir(self, name: str, parent_id: str):
+        """
+        Creates a new GDrive folder @name inside folder @parent_id
+        Returns id of the created folder
+        """
+        logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
+        file_metadata = {
+            'name': [name],
+            'mimeType': 'application/vnd.google-apps.folder',
+            'parents': [parent_id]
+        }
+        gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
+        return gd_folder.get('id')
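A hypothetical GDStorage round trip; the folder id and paths are made up. Note that get_cdn_url resolves every path segment with _get_id_from_parent_and_name (raise_on_missing=True), so it only succeeds after upload() has created the folder chain:

gd = GDStorage(GDConfig(root_folder_id="XXXX", folder="my-archive"))
gd.upload("local_archive/page.html", "SM0001/page.html")  # creates my-archive/SM0001 as needed
print(gd.get_cdn_url("SM0001/page.html"))
# -> https://drive.google.com/file/d/<file_id>/view?usp=sharing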
@@ -0,0 +1,31 @@
import os

from dataclasses import dataclass

from .base_storage import Storage
from utils import mkdir_if_not_exists


@dataclass
class LocalConfig:
    folder: str = ""
    save_to: str = "./"


class LocalStorage(Storage):
    def __init__(self, config: LocalConfig):
        self.folder = config.folder
        self.save_to = config.save_to
        mkdir_if_not_exists(self.save_to)

    def get_cdn_url(self, key):
        full_path = os.path.join(self.save_to, self.folder, key)
        mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
        return os.path.abspath(full_path)

    def exists(self, key):
        return os.path.isfile(self.get_cdn_url(key))

    def uploadf(self, file, key, **kwargs):
        path = self.get_cdn_url(key)
        with open(path, "wb") as outf:
            outf.write(file.read())
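A hypothetical LocalStorage usage; the paths are placeholders:

local = LocalStorage(LocalConfig(folder="demo", save_to="./local_archive"))
with open("screenshot.png", "rb") as f:
    local.uploadf(f, "item1/screenshot.png")     # writes ./local_archive/demo/item1/screenshot.png
print(local.exists("item1/screenshot.png"))      # True
print(local.get_cdn_url("item1/screenshot.png")) # absolute path to the file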
@@ -1,5 +1,9 @@
-import uuid
+import uuid, os
+from dataclasses import dataclass
+
 import boto3
 from botocore.errorfactory import ClientError
+
 from .base_storage import Storage
-from dataclasses import dataclass
 from loguru import logger
@@ -12,30 +16,47 @@ class S3Config:
     key: str
     secret: str
     folder: str = ""
+    endpoint_url: str = "https://{region}.digitaloceanspaces.com"
+    cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
     private: bool = False
+    key_path: str = "default"  # 'default' uses full naming, 'random' uses generated uuid
 
 
 class S3Storage(Storage):
 
     def __init__(self, config: S3Config):
         self.bucket = config.bucket
         self.region = config.region
-        self.folder = self.clean_path(config.folder)
+        self.folder = config.folder
         self.private = config.private
+        self.cdn_url = config.cdn_url
+        self.key_path = config.key_path
+        self.key_dict = {}
 
         self.s3 = boto3.client(
             's3',
-            region_name=self.region,
-            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            region_name=config.region,
+            endpoint_url=config.endpoint_url.format(region=config.region),
             aws_access_key_id=config.key,
             aws_secret_access_key=config.secret
         )
 
     def _get_path(self, key):
-        return self.folder + self.clean_path(self.subfolder) + key
+        """
+        Depends on the self.key_path configuration:
+        * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
+        * default - defaults to self.folder/key
+        """
+        # defaults to /key
+        final_key = key
+        if self.key_path == "random":
+            if key not in self.key_dict:
+                ext = os.path.splitext(key)[1]
+                self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
+            final_key = self.key_dict[key]
+        return os.path.join(self.folder, final_key)
 
     def get_cdn_url(self, key):
-        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
 
     def exists(self, key):
         try:
@@ -45,7 +66,6 @@ class S3Storage(Storage):
         return False
 
     def uploadf(self, file, key, **kwargs):
-        logger.debug(f'[S3 storage] uploading {file=}, {key=}')
         if self.private:
             extra_args = kwargs.get("extra_args", {})
         else:
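A sketch of the two key_path modes; the bucket and credentials are placeholders. With key_path="random" the first _get_path call generates a UUID-based key, and key_dict makes repeated calls with the same key return the same path:

cfg = S3Config(bucket="my-bucket", region="reg1", key="s3 API key", secret="s3 API secret", key_path="random")
s3 = S3Storage(cfg)
print(s3._get_path("SM0001/video.mp4"))   # e.g. 9c1d...f0.mp4, stable across calls
print(s3.get_cdn_url("SM0001/video.mp4")) # cdn_url template filled with bucket/region/key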
@@ -10,10 +10,10 @@ class GWorksheet:
     """
     COLUMN_NAMES = {
         'url': 'link',
-        'subfolder': 'sub folder',
-        'status': 'archive status',
+        'folder': 'destination folder',
         'archive': 'archive location',
         'date': 'archive date',
+        'status': 'archive status',
         'thumbnail': 'thumbnail',
         'thumbnail_index': 'thumbnail index',
         'timestamp': 'upload timestamp',
@@ -25,9 +25,12 @@ class GWorksheet:
     def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
         self.wks = worksheet
-        self.values = self.wks.get_values()
-        self.headers = [v.lower() for v in self.values[header_row - 1]]
         self.columns = columns
+        self.values = self.wks.get_values()
+        if len(self.values) > 0:
+            self.headers = [v.lower() for v in self.values[header_row - 1]]
+        else:
+            self.headers = []
 
     def _check_col_exists(self, col: str):
         if col not in self.columns:
@@ -69,12 +72,15 @@ class GWorksheet:
             return ''
         return row[col_index]
 
-    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False):
+    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
         """
         return self.get_cell or default value on error (eg: column is missing)
         """
         try:
-            return self.get_cell(row, col, fresh)
+            val = self.get_cell(row, col, fresh)
+            if when_empty_use_default and val.strip() == "":
+                return default
+            return val
         except:
             return default
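A hypothetical call showing the new when_empty_use_default behaviour; worksheet and row are assumed to come from gspread:

gw = GWorksheet(worksheet)                                  # worksheet: an opened gspread worksheet
folder = gw.get_cell_or_default(row, 'folder', default="")  # an empty cell now falls back to default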
@@ -1,5 +1,28 @@
-import os
+import os, sys, requests
+from loguru import logger
 
 
 def mkdir_if_not_exists(folder):
     if not os.path.exists(folder):
-        os.mkdir(folder)
+        os.makedirs(folder)
+
+
+def expand_url(url):
+    # expand short URL links
+    if 'https://t.co/' in url:
+        try:
+            r = requests.get(url)
+            logger.debug(f'Expanded url {url} to {r.url}')
+            return r.url
+        except:
+            logger.error(f'Failed to expand url {url}')
+    return url
+
+
+def getattr_or(o: object, prop: str, default=None):
+    try:
+        res = getattr(o, prop)
+        if res is None: raise
+        return res
+    except:
+        return default
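Brief usage of the utils above; the t.co link is a placeholder:

mkdir_if_not_exists("logs/2022/06")          # now creates intermediate dirs via os.makedirs
expand_url("https://t.co/xxxxxxxx")          # follows the redirect when possible, else returns the input
getattr_or(object(), "missing", default=42)  # -> 42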