From 1eb17e4de5039e82e1fe9afcabd98c1abf2a4e2e Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 25 Feb 2022 13:54:40 +0100 Subject: [PATCH] Add hash and screenshot methods; switch to more recent ytdl fork --- Pipfile | 4 +- Pipfile.lock | 412 ++++++++++++++++++++++++++++++-- archivers/base_archiver.py | 29 ++- archivers/telegram_archiver.py | 8 +- archivers/tiktok_archiver.py | 6 +- archivers/wayback_archiver.py | 7 +- archivers/youtubedl_archiver.py | 15 +- 7 files changed, 444 insertions(+), 37 deletions(-) diff --git a/Pipfile b/Pipfile index 27071fa..23af1e8 100644 --- a/Pipfile +++ b/Pipfile @@ -7,13 +7,15 @@ name = "pypi" gspread = "*" boto3 = "*" python-dotenv = "*" -youtube_dl = "*" argparse = "*" beautifulsoup4 = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} bs4 = "*" loguru = "*" ffmpeg-python = "*" +selenium = "*" +snscrape = "*" +yt-dlp = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 9879884..f456cd9 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be" + "sha256": "0d910665da5b5d8da7ab3f03bab399cf615aaab5c036c0eb82a0c16e105cebbe" }, "pipfile-spec": 6, "requires": { @@ -24,6 +24,22 @@ "index": "pypi", "version": "==1.4.0" }, + "async-generator": { + "hashes": [ + "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", + "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144" + ], + "markers": "python_version >= '3.5'", + "version": "==1.10" + }, + "attrs": { + "hashes": [ + "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", + "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.4.0" + }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -34,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989", - "sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557" + "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", + "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" ], "index": "pypi", - "version": "==1.21.3" + "version": "==1.21.7" }, "botocore": { "hashes": [ - "sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7", - "sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046" + "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", + "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" ], "markers": "python_version >= '3.6'", - "version": "==1.24.3" + "version": "==1.24.7" }, "bs4": { "hashes": [ @@ -70,6 +86,61 @@ ], "version": "==2021.10.8" }, + "cffi": { + "hashes": [ + "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3", + "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2", + "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636", + "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20", + "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728", + "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27", + "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66", + "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443", + "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0", + "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7", + "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39", + "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605", + "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a", + "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37", + "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029", + "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139", + "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc", + "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df", + "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14", + "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880", + "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2", + "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a", + "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e", + "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474", + "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024", + "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8", + "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0", + "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e", + "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a", + "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e", + "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032", + "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6", + "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e", + "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b", + "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e", + "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954", + "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962", + "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c", + "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4", + "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55", + "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962", + "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023", + "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c", + "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6", + "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8", + "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382", + "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7", + "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc", + "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997", + "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796" + ], + "version": "==1.15.0" + }, "charset-normalizer": { "hashes": [ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", @@ -93,13 +164,30 @@ ], "version": "==1.2.58" }, - "faker": { + "cryptography": { "hashes": [ - "sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b", - "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe" + "sha256:0a817b961b46894c5ca8a66b599c745b9a3d9f822725221f0e0fe49dc043a3a3", + "sha256:2d87cdcb378d3cfed944dac30596da1968f88fb96d7fc34fdae30a99054b2e31", + "sha256:30ee1eb3ebe1644d1c3f183d115a8c04e4e603ed6ce8e394ed39eea4a98469ac", + "sha256:391432971a66cfaf94b21c24ab465a4cc3e8bf4a939c1ca5c3e3a6e0abebdbcf", + "sha256:39bdf8e70eee6b1c7b289ec6e5d84d49a6bfa11f8b8646b5b3dfe41219153316", + "sha256:4caa4b893d8fad33cf1964d3e51842cd78ba87401ab1d2e44556826df849a8ca", + "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638", + "sha256:596f3cd67e1b950bc372c33f1a28a0692080625592ea6392987dba7f09f17a94", + "sha256:5d59a9d55027a8b88fd9fd2826c4392bd487d74bf628bb9d39beecc62a644c12", + "sha256:6c0c021f35b421ebf5976abf2daacc47e235f8b6082d3396a2fe3ccd537ab173", + "sha256:73bc2d3f2444bcfeac67dd130ff2ea598ea5f20b40e36d19821b4df8c9c5037b", + "sha256:74d6c7e80609c0f4c2434b97b80c7f8fdfaa072ca4baab7e239a15d6d70ed73a", + "sha256:7be0eec337359c155df191d6ae00a5e8bbb63933883f4f5dffc439dac5348c3f", + "sha256:94ae132f0e40fe48f310bba63f477f14a43116f05ddb69d6fa31e93f05848ae2", + "sha256:bb5829d027ff82aa872d76158919045a7c1e91fbf241aec32cb07956e9ebd3c9", + "sha256:ca238ceb7ba0bdf6ce88c1b74a87bffcee5afbfa1e41e173b1ceb095b39add46", + "sha256:ca28641954f767f9822c24e927ad894d45d5a1e501767599647259cbf030b903", + "sha256:e0344c14c9cb89e76eb6a060e67980c9e35b3f36691e15e1b7a9e58a0a6c6dc3", + "sha256:ebc15b1c22e55c4d5566e3ca4db8689470a0ca2babef8e3a9ee057a8b82ce4b1", + "sha256:ec63da4e7e4a5f924b90af42eddf20b698a70e58d86a72d943857c4c6045b3ee" ], - "markers": "python_version >= '3.6'", - "version": "==13.0.0" + "version": "==36.0.1" }, "ffmpeg-python": { "hashes": [ @@ -109,6 +197,14 @@ "index": "pypi", "version": "==0.2.0" }, + "filelock": { + "hashes": [ + "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", + "sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0" + ], + "markers": "python_version >= '3.7'", + "version": "==3.6.0" + }, "flask": { "hashes": [ "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f", @@ -134,11 +230,11 @@ }, "google-auth-oauthlib": { "hashes": [ - "sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73", - "sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a" + "sha256:06c4ceb3ab2a93b85b8976bbe86cbb82ae1d1c02d2ded3cfd0847a8b6955263b", + "sha256:8b7ff4d2fe81e3bd034306aa665444360b3c67195b9dea582dddc7dfb8d89d34" ], "markers": "python_version >= '3.6'", - "version": "==0.4.6" + "version": "==0.5.0" }, "gspread": { "hashes": [ @@ -148,6 +244,14 @@ "index": "pypi", "version": "==5.1.1" }, + "h11": { + "hashes": [ + "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06", + "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442" + ], + "markers": "python_version >= '3.6'", + "version": "==0.13.0" + }, "idna": { "hashes": [ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", @@ -188,6 +292,73 @@ "index": "pypi", "version": "==0.6.0" }, + "lxml": { + "hashes": [ + "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169", + "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428", + "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc", + "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85", + "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696", + "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507", + "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3", + "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430", + "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03", + "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9", + "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b", + "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7", + "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5", + "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654", + "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca", + "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9", + "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c", + "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63", + "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe", + "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9", + "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9", + "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1", + "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939", + "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68", + "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613", + "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63", + "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e", + "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4", + "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79", + "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1", + "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e", + "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141", + "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb", + "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939", + "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a", + "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93", + "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9", + "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2", + "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6", + "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa", + "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150", + "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea", + "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33", + "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76", + "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807", + "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a", + "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4", + "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15", + "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f", + "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429", + "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c", + "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5", + "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870", + "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b", + "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8", + "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c", + "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87", + "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0", + "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23", + "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170", + "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.8.0" + }, "markupsafe": { "hashes": [ "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", @@ -234,6 +405,14 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, + "mutagen": { + "hashes": [ + "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", + "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" + ], + "markers": "python_version >= '3.5' and python_version < '4'", + "version": "==1.45.1" + }, "oauthlib": { "hashes": [ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", @@ -242,6 +421,23 @@ "markers": "python_version >= '3.6'", "version": "==3.2.0" }, + "outcome": { + "hashes": [ + "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958", + "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967" + ], + "markers": "python_version >= '3.6'", + "version": "==1.1.0" + }, + "py-mini-racer": { + "hashes": [ + "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57", + "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2", + "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab", + "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11" + ], + "version": "==0.6.0" + }, "pyasn1": { "hashes": [ "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", @@ -278,6 +474,53 @@ ], "version": "==0.2.8" }, + "pycparser": { + "hashes": [ + "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9", + "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206" + ], + "version": "==2.21" + }, + "pycryptodomex": { + "hashes": [ + "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", + "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", + "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", + "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", + "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", + "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", + "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", + "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", + "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", + "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", + "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", + "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", + "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", + "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", + "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", + "sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", + "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", + "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", + "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", + "sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", + "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", + "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", + "sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", + "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", + "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", + "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", + "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.14.1" + }, + "pyopenssl": { + "hashes": [ + "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf", + "sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0" + ], + "version": "==22.0.0" + }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -286,6 +529,14 @@ "markers": "python_version >= '3.6'", "version": "==3.0.7" }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "version": "==1.7.1" + }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", @@ -303,6 +554,9 @@ "version": "==0.19.2" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -335,11 +589,18 @@ }, "s3transfer": { "hashes": [ - "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", - "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971", + "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed" ], "markers": "python_version >= '3.6'", - "version": "==0.5.1" + "version": "==0.5.2" + }, + "selenium": { + "hashes": [ + "sha256:7da6d7ab2c83a21e498deda02bb5e7fb0ac5da5e72438f6d01b015b185b5e1df" + ], + "index": "pypi", + "version": "==4.1.2" }, "six": { "hashes": [ @@ -349,6 +610,29 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, + "sniffio": { + "hashes": [ + "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663", + "sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de" + ], + "markers": "python_version >= '3.5'", + "version": "==1.2.0" + }, + "snscrape": { + "hashes": [ + "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", + "sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2" + ], + "index": "pypi", + "version": "==0.4.3.20220106" + }, + "sortedcontainers": { + "hashes": [ + "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", + "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0" + ], + "version": "==2.4.0" + }, "soupsieve": { "hashes": [ "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", @@ -359,9 +643,29 @@ }, "tiktok-downloader": { "git": "https://github.com/msramalho/tiktok-downloader", - "ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2" + "ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c" + }, + "trio": { + "hashes": [ + "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070", + "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a" + ], + "markers": "python_version >= '3.7'", + "version": "==0.20.0" + }, + "trio-websocket": { + "hashes": [ + "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc", + "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe" + ], + "markers": "python_version >= '3.5'", + "version": "==0.9.2" }, "urllib3": { + "extras": [ + "secure", + "socks" + ], "hashes": [ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" @@ -369,6 +673,60 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, + "websockets": { + "hashes": [ + "sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138", + "sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86", + "sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1", + "sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b", + "sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6", + "sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397", + "sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e", + "sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490", + "sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03", + "sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe", + "sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e", + "sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c", + "sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce", + "sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c", + "sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186", + "sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8", + "sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd", + "sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4", + "sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0", + "sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2", + "sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf", + "sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098", + "sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045", + "sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8", + "sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71", + "sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9", + "sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3", + "sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d", + "sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f", + "sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa", + "sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f", + "sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a", + "sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42", + "sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3", + "sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd", + "sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325", + "sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39", + "sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124", + "sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5", + "sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b", + "sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c", + "sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea", + "sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa", + "sha256:f09f46b1ff6d09b01c7816c50bd1903cf7d02ebbdb63726132717c2fcda835d5", + "sha256:f14bd10e170abc01682a9f8b28b16e6f20acf6175945ef38db6ffe31b0c72c3f", + "sha256:f5c335dc0e7dc271ef36df3f439868b3c790775f345338c2f61a562f1074187b", + "sha256:f8296b8408ec6853b26771599990721a26403e62b9de7e50ac0a056772ac0b5e", + "sha256:fa35c5d1830d0fb7b810324e9eeab9aa92e8f273f11fdbdc0741dcded6d72b9f" + ], + "markers": "python_version >= '3.7'", + "version": "==10.2" + }, "werkzeug": { "hashes": [ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8", @@ -377,13 +735,21 @@ "markers": "python_version >= '3.6'", "version": "==2.0.3" }, - "youtube-dl": { + "wsproto": { "hashes": [ - "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", - "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" + "sha256:868776f8456997ad0d9720f7322b746bbe9193751b5b290b7f924659377c8c38", + "sha256:d8345d1808dd599b5ffb352c25a367adb6157e664e140dbecba3f9bc007edb9f" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==1.0.0" + }, + "yt-dlp": { + "hashes": [ + "sha256:81b50ed7cf9cfcc042d8f5a1ad2d1cd7b13c48b36c07faf1880696eac0a7ddb5", + "sha256:b0051920e066379acba6e253adba8bc1592e2ad1b7923df3a56793a4774b0cee" ], "index": "pypi", - "version": "==2021.12.17" + "version": "==2022.2.4" } }, "develop": {} diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 12cca80..a55b1ca 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -5,6 +5,10 @@ import shutil from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse +import hashlib +from selenium.common.exceptions import TimeoutException +from loguru import logger +import time from storages import Storage from utils import mkdir_if_not_exists @@ -19,13 +23,16 @@ class ArchiveResult: duration: float = None title: str = None timestamp: datetime.datetime = None + screenshot: str = None + hash: str = None class Archiver(ABC): name = "default" - def __init__(self, storage: Storage): + def __init__(self, storage: Storage, driver): self.storage = storage + self.driver = driver def __str__(self): return self.__class__.__name__ @@ -46,6 +53,26 @@ class Archiver(ABC): _id = _id.replace('unknown_video', 'jpg') return f'{self.name}_{_id}{extension}' + def get_hash(self, filename): + f = open(filename, "rb") + bytes = f.read() # read entire file as bytes + hash = hashlib.sha256(bytes) + f.close() + return hash.hexdigest() + + def get_screenshot(self, url): + key = self.get_key(urlparse(url).path.replace( + "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") + filename = 'tmp/' + key + + self.driver.get(url) + time.sleep(6) + + self.driver.save_screenshot(filename) + self.storage.upload(filename, key, extra_args={ + 'ACL': 'public-read', 'ContentType': 'image/png'}) + return self.storage.get_cdn_url(key) + def get_thumbnails(self, filename, key, duration=None): thumbnails_folder = filename.split('.')[0] + '/' key_folder = key.split('.')[0] + '/' diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index d7b8924..c43fc7d 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -49,6 +49,9 @@ class TelegramArchiver(Archiver): if status != 'already archived': self.storage.upload(filename, key) + hash = self.get_hash(filename) + screenshot = self.get_screenshot(url) + # extract duration from HTML duration = s.find_all('time')[0].contents[0] if ':' in duration: @@ -58,8 +61,9 @@ class TelegramArchiver(Archiver): duration = float(duration) # process thumbnails - key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) + key_thumb, thumb_index = self.get_thumbnails( + filename, key, duration=duration) os.remove(filename) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) + duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 62aa415..6b5116f 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -43,12 +43,16 @@ class TiktokArchiver(Archiver): key_thumb = '' thumb_index = 'error creating thumbnails' + hash = self.get_hash(filename) + screenshot = self.get_screenshot(url) + try: os.remove(filename) except FileNotFoundError: logger.info(f'tmp file not found thus not deleted {filename}') return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) + thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(), + hash=hash, screenshot=screenshot) except tiktok_downloader.Except.InvalidUrl: status = 'Invalid URL' diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 53b356f..1fa98aa 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -8,8 +8,8 @@ from .base_archiver import Archiver, ArchiveResult class WaybackArchiver(Archiver): name = "wayback" - def __init__(self, storage: Storage): - super(WaybackArchiver, self).__init__(storage) + def __init__(self, storage: Storage, driver): + super(WaybackArchiver, self).__init__(storage, driver) self.seen_urls = {} def download(self, url, check_if_exists=False): @@ -71,6 +71,7 @@ class WaybackArchiver(Archiver): except: title = "Could not get title" - result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title) + screenshot = self.get_screenshot(url) + result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) self.seen_urls[url] = result return result diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index ec11061..1e4c496 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -1,29 +1,29 @@ import os import datetime -import youtube_dl +import yt_dlp from loguru import logger from .base_archiver import Archiver, ArchiveResult class YoutubeDLArchiver(Archiver): - name = "yotube_dl" + name = "youtube_dl" ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'): logger.info('Using Facebook cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') + yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts) + ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts) cdn_url = None status = 'success' try: info = ydl.extract_info(url, download=False) - except youtube_dl.utils.DownloadError: + except yt_dlp.utils.DownloadError: # no video here return False @@ -74,6 +74,9 @@ class YoutubeDLArchiver(Archiver): self.storage.upload(filename, key) + hash = self.get_hash(filename) + screenshot = self.get_screenshot(url) + # get duration duration = info.get('duration') @@ -89,4 +92,4 @@ class YoutubeDLArchiver(Archiver): timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, timestamp=timestamp) + title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)