Merge pull request #16 from bellingcat/screenshots

WIP: screenshots and hashing
pull/25/head
Logan Williams 2022-03-09 14:59:07 +01:00 zatwierdzone przez GitHub
commit d30115935e
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
12 zmienionych plików z 659 dodań i 83 usunięć

2
.gitignore vendored
Wyświetl plik

@ -6,3 +6,5 @@ service_account.json
__pycache__/
._*
anu.html
geckodriver.log

Wyświetl plik

@ -7,13 +7,15 @@ name = "pypi"
gspread = "*"
boto3 = "*"
python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
beautifulsoup4 = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"
ffmpeg-python = "*"
selenium = "*"
snscrape = "*"
yt-dlp = "*"
[dev-packages]

412
Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
"sha256": "0d910665da5b5d8da7ab3f03bab399cf615aaab5c036c0eb82a0c16e105cebbe"
},
"pipfile-spec": 6,
"requires": {
@ -24,6 +24,22 @@
"index": "pypi",
"version": "==1.4.0"
},
"async-generator": {
"hashes": [
"sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b",
"sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144"
],
"markers": "python_version >= '3.5'",
"version": "==1.10"
},
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"beautifulsoup4": {
"hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@ -34,19 +50,19 @@
},
"boto3": {
"hashes": [
"sha256:aa00024cc1f3d24b2318dae4d5dbaa173c8da8bc6f9d12f0b2e67467ec460989",
"sha256:ab4ab2392f7520c01ce6e40e6df4b5b65a575ee6bd9fb78db0239cb2a06de557"
"sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b",
"sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61"
],
"index": "pypi",
"version": "==1.21.3"
"version": "==1.21.7"
},
"botocore": {
"hashes": [
"sha256:979e5c5e826ff115f4903fe9887b191f3809229f694a747f910e1221fe63efc7",
"sha256:ca33f747c67cd0e109fab9398d39c38c1a2df352c1e1f9823839df8f1db58046"
"sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c",
"sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.3"
"version": "==1.24.7"
},
"bs4": {
"hashes": [
@ -70,6 +86,61 @@
],
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
],
"version": "==1.15.0"
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
@ -93,13 +164,30 @@
],
"version": "==1.2.58"
},
"faker": {
"cryptography": {
"hashes": [
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
"sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
"sha256:0a817b961b46894c5ca8a66b599c745b9a3d9f822725221f0e0fe49dc043a3a3",
"sha256:2d87cdcb378d3cfed944dac30596da1968f88fb96d7fc34fdae30a99054b2e31",
"sha256:30ee1eb3ebe1644d1c3f183d115a8c04e4e603ed6ce8e394ed39eea4a98469ac",
"sha256:391432971a66cfaf94b21c24ab465a4cc3e8bf4a939c1ca5c3e3a6e0abebdbcf",
"sha256:39bdf8e70eee6b1c7b289ec6e5d84d49a6bfa11f8b8646b5b3dfe41219153316",
"sha256:4caa4b893d8fad33cf1964d3e51842cd78ba87401ab1d2e44556826df849a8ca",
"sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638",
"sha256:596f3cd67e1b950bc372c33f1a28a0692080625592ea6392987dba7f09f17a94",
"sha256:5d59a9d55027a8b88fd9fd2826c4392bd487d74bf628bb9d39beecc62a644c12",
"sha256:6c0c021f35b421ebf5976abf2daacc47e235f8b6082d3396a2fe3ccd537ab173",
"sha256:73bc2d3f2444bcfeac67dd130ff2ea598ea5f20b40e36d19821b4df8c9c5037b",
"sha256:74d6c7e80609c0f4c2434b97b80c7f8fdfaa072ca4baab7e239a15d6d70ed73a",
"sha256:7be0eec337359c155df191d6ae00a5e8bbb63933883f4f5dffc439dac5348c3f",
"sha256:94ae132f0e40fe48f310bba63f477f14a43116f05ddb69d6fa31e93f05848ae2",
"sha256:bb5829d027ff82aa872d76158919045a7c1e91fbf241aec32cb07956e9ebd3c9",
"sha256:ca238ceb7ba0bdf6ce88c1b74a87bffcee5afbfa1e41e173b1ceb095b39add46",
"sha256:ca28641954f767f9822c24e927ad894d45d5a1e501767599647259cbf030b903",
"sha256:e0344c14c9cb89e76eb6a060e67980c9e35b3f36691e15e1b7a9e58a0a6c6dc3",
"sha256:ebc15b1c22e55c4d5566e3ca4db8689470a0ca2babef8e3a9ee057a8b82ce4b1",
"sha256:ec63da4e7e4a5f924b90af42eddf20b698a70e58d86a72d943857c4c6045b3ee"
],
"markers": "python_version >= '3.6'",
"version": "==13.0.0"
"version": "==36.0.1"
},
"ffmpeg-python": {
"hashes": [
@ -109,6 +197,14 @@
"index": "pypi",
"version": "==0.2.0"
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
"sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0"
],
"markers": "python_version >= '3.7'",
"version": "==3.6.0"
},
"flask": {
"hashes": [
"sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f",
@ -134,11 +230,11 @@
},
"google-auth-oauthlib": {
"hashes": [
"sha256:3f2a6e802eebbb6fb736a370fbf3b055edcb6b52878bf2f26330b5e041316c73",
"sha256:a90a072f6993f2c327067bf65270046384cda5a8ecb20b94ea9a687f1f233a7a"
"sha256:06c4ceb3ab2a93b85b8976bbe86cbb82ae1d1c02d2ded3cfd0847a8b6955263b",
"sha256:8b7ff4d2fe81e3bd034306aa665444360b3c67195b9dea582dddc7dfb8d89d34"
],
"markers": "python_version >= '3.6'",
"version": "==0.4.6"
"version": "==0.5.0"
},
"gspread": {
"hashes": [
@ -148,6 +244,14 @@
"index": "pypi",
"version": "==5.1.1"
},
"h11": {
"hashes": [
"sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06",
"sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"
],
"markers": "python_version >= '3.6'",
"version": "==0.13.0"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
@ -188,6 +292,73 @@
"index": "pypi",
"version": "==0.6.0"
},
"lxml": {
"hashes": [
"sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
"sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
"sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
"sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
"sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
"sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
"sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
"sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
"sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
"sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
"sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
"sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
"sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
"sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
"sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
"sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
"sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
"sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
"sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
"sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
"sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
"sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
"sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
"sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
"sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
"sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
"sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
"sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
"sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
"sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
"sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
"sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
"sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
"sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
"sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
"sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
"sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
"sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
"sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
"sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
"sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
"sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
"sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
"sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
"sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
"sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
"sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
"sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
"sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
"sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
"sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
"sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
"sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
"sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
"sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
"sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
"sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
"sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
"sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
@ -234,6 +405,14 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"mutagen": {
"hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@ -242,6 +421,23 @@
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"outcome": {
"hashes": [
"sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958",
"sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967"
],
"markers": "python_version >= '3.6'",
"version": "==1.1.0"
},
"py-mini-racer": {
"hashes": [
"sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
"sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
"sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
"sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
],
"version": "==0.6.0"
},
"pyasn1": {
"hashes": [
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@ -278,6 +474,53 @@
],
"version": "==0.2.8"
},
"pycparser": {
"hashes": [
"sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9",
"sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"
],
"version": "==2.21"
},
"pycryptodomex": {
"hashes": [
"sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a",
"sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659",
"sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45",
"sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2",
"sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489",
"sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2",
"sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750",
"sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151",
"sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2",
"sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045",
"sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859",
"sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89",
"sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e",
"sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461",
"sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6",
"sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f",
"sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb",
"sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662",
"sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b",
"sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe",
"sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8",
"sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125",
"sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c",
"sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48",
"sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff",
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"pyopenssl": {
"hashes": [
"sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf",
"sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"
],
"version": "==22.0.0"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@ -286,6 +529,14 @@
"markers": "python_version >= '3.6'",
"version": "==3.0.7"
},
"pysocks": {
"hashes": [
"sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
"sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
],
"version": "==1.7.1"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
@ -303,6 +554,9 @@
"version": "==0.19.2"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@ -335,11 +589,18 @@
},
"s3transfer": {
"hashes": [
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
"sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
"version": "==0.5.2"
},
"selenium": {
"hashes": [
"sha256:7da6d7ab2c83a21e498deda02bb5e7fb0ac5da5e72438f6d01b015b185b5e1df"
],
"index": "pypi",
"version": "==4.1.2"
},
"six": {
"hashes": [
@ -349,6 +610,29 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"sniffio": {
"hashes": [
"sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663",
"sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de"
],
"markers": "python_version >= '3.5'",
"version": "==1.2.0"
},
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
],
"index": "pypi",
"version": "==0.4.3.20220106"
},
"sortedcontainers": {
"hashes": [
"sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88",
"sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"
],
"version": "==2.4.0"
},
"soupsieve": {
"hashes": [
"sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
@ -359,9 +643,29 @@
},
"tiktok-downloader": {
"git": "https://github.com/msramalho/tiktok-downloader",
"ref": "81c6ea1f959b2cc5620961043272592bd1bfc2e2"
"ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c"
},
"trio": {
"hashes": [
"sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070",
"sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a"
],
"markers": "python_version >= '3.7'",
"version": "==0.20.0"
},
"trio-websocket": {
"hashes": [
"sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc",
"sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe"
],
"markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"urllib3": {
"extras": [
"secure",
"socks"
],
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
@ -369,6 +673,60 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"websockets": {
"hashes": [
"sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138",
"sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86",
"sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1",
"sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b",
"sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6",
"sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397",
"sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e",
"sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490",
"sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03",
"sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe",
"sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e",
"sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c",
"sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce",
"sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c",
"sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186",
"sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8",
"sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd",
"sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4",
"sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0",
"sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2",
"sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf",
"sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098",
"sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045",
"sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8",
"sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71",
"sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9",
"sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3",
"sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d",
"sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f",
"sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa",
"sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f",
"sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a",
"sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42",
"sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3",
"sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd",
"sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325",
"sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39",
"sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124",
"sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5",
"sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b",
"sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c",
"sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea",
"sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa",
"sha256:f09f46b1ff6d09b01c7816c50bd1903cf7d02ebbdb63726132717c2fcda835d5",
"sha256:f14bd10e170abc01682a9f8b28b16e6f20acf6175945ef38db6ffe31b0c72c3f",
"sha256:f5c335dc0e7dc271ef36df3f439868b3c790775f345338c2f61a562f1074187b",
"sha256:f8296b8408ec6853b26771599990721a26403e62b9de7e50ac0a056772ac0b5e",
"sha256:fa35c5d1830d0fb7b810324e9eeab9aa92e8f273f11fdbdc0741dcded6d72b9f"
],
"markers": "python_version >= '3.7'",
"version": "==10.2"
},
"werkzeug": {
"hashes": [
"sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
@ -377,13 +735,21 @@
"markers": "python_version >= '3.6'",
"version": "==2.0.3"
},
"youtube-dl": {
"wsproto": {
"hashes": [
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
"sha256:868776f8456997ad0d9720f7322b746bbe9193751b5b290b7f924659377c8c38",
"sha256:d8345d1808dd599b5ffb352c25a367adb6157e664e140dbecba3f9bc007edb9f"
],
"markers": "python_full_version >= '3.6.1'",
"version": "==1.0.0"
},
"yt-dlp": {
"hashes": [
"sha256:81b50ed7cf9cfcc042d8f5a1ad2d1cd7b13c48b36c07faf1880696eac0a7ddb5",
"sha256:b0051920e066379acba6e253adba8bc1592e2ad1b7923df3a56793a4774b0cee"
],
"index": "pypi",
"version": "==2021.12.17"
"version": "==2022.2.4"
}
},
"develop": {}

Wyświetl plik

@ -3,4 +3,5 @@ from .base_archiver import *
from .telegram_archiver import *
from .tiktok_archiver import *
from .wayback_archiver import *
from .youtubedl_archiver import *
from .youtubedl_archiver import *
from .twitter_archiver import *

Wyświetl plik

@ -5,6 +5,11 @@ import shutil
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
import hashlib
from selenium.common.exceptions import TimeoutException
from loguru import logger
import time
import requests
from storages import Storage
from utils import mkdir_if_not_exists
@ -19,13 +24,16 @@ class ArchiveResult:
duration: float = None
title: str = None
timestamp: datetime.datetime = None
screenshot: str = None
hash: str = None
class Archiver(ABC):
name = "default"
def __init__(self, storage: Storage):
def __init__(self, storage: Storage, driver):
self.storage = storage
self.driver = driver
def __str__(self):
return self.__class__.__name__
@ -36,6 +44,56 @@ class Archiver(ABC):
def get_netloc(self, url):
return urlparse(url).netloc
def generate_media_page(self, urls, url, object):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
page = f'''<html><head><title>{url}</title></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''
thumbnail = None
for media_url in urls:
path = urlparse(media_url).path
key = self.get_key(path.replace("/", "_"))
if '.' not in path:
key += '.jpg'
filename = 'tmp/' + key
d = requests.get(media_url, headers=headers)
with open(filename, 'wb') as f:
f.write(d.content)
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
if thumbnail is None:
thumbnail = cdn_url
page += f'''<li><a href="{cdn_url}">{media_url}</a>: {hash}</li>'''
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
return (page_cdn, page_hash, thumbnail)
def get_key(self, filename):
"""
returns a key in the format "[archiverName]_[filename]" includes extension
@ -44,8 +102,33 @@ class Archiver(ABC):
_id, extension = os.path.splitext(tail) # returns [filename, .ext]
if 'unknown_video' in _id:
_id = _id.replace('unknown_video', 'jpg')
# long filenames can cause problems, so trim them if necessary
if len(_id) > 128:
_id = _id[-128:]
return f'{self.name}_{_id}{extension}'
def get_hash(self, filename):
f = open(filename, "rb")
bytes = f.read() # read entire file as bytes
hash = hashlib.sha256(bytes)
f.close()
return hash.hexdigest()
def get_screenshot(self, url):
key = self.get_key(urlparse(url).path.replace(
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key
self.driver.get(url)
time.sleep(6)
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'image/png'})
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = filename.split('.')[0] + '/'
key_folder = key.split('.')[0] + '/'
@ -99,7 +182,8 @@ class Archiver(ABC):
thumb_index = key_folder + 'index.html'
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
self.storage.upload(index_fname, thumb_index, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
shutil.rmtree(thumbnails_folder)
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

Wyświetl plik

@ -1,6 +1,9 @@
import os
import requests
from bs4 import BeautifulSoup
from loguru import logger
import re
import html
from .base_archiver import Archiver, ArchiveResult
@ -24,12 +27,24 @@ class TelegramArchiver(Archiver):
if url[-8:] != "?embed=1":
url += "?embed=1"
screenshot = self.get_screenshot(url)
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
return False # could not find video
logger.warning("could not find video")
image_tags = s.find_all(class_="js-message_photo")
images = []
for im in image_tags:
urls = [u.replace("'", "") for u in re.findall('url\((.*?)\)', im['style'])]
images += urls
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=s.find_all('time')[0].get('datetime'))
video_url = video.get('src')
video_id = video_url.split('/')[-1].split('?')[0]
@ -49,17 +64,20 @@ class TelegramArchiver(Archiver):
if status != 'already archived':
self.storage.upload(filename, key)
hash = self.get_hash(filename)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(':')[0]) * 60
+ float(duration.split(':')[1])
duration = float(duration.split(
':')[0]) * 60 + float(duration.split(':')[1])
else:
duration = float(duration)
# process thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
key_thumb, thumb_index = self.get_thumbnails(
filename, key, duration=duration)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)

Wyświetl plik

@ -43,12 +43,16 @@ class TiktokArchiver(Archiver):
key_thumb = ''
thumb_index = 'error creating thumbnails'
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
try: os.remove(filename)
except FileNotFoundError:
logger.info(f'tmp file not found thus not deleted {filename}')
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
hash=hash, screenshot=screenshot)
except tiktok_downloader.Except.InvalidUrl:
status = 'Invalid URL'

Wyświetl plik

@ -0,0 +1,52 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
class TwitterArchiver(Archiver):
name = "twitter"
def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url):
return False
tweet_id = urlparse(url).path.split('/')
if 'status' in tweet_id:
i = tweet_id.index('status')
tweet_id = tweet_id[i+1]
else:
return False
scr = TwitterTweetScraper(tweet_id)
try:
tweet = next(scr.get_items())
except:
logger.warning('wah wah')
return False
if tweet.media is None:
return False
urls = []
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
urls.append(variant.url)
elif type(media) == Gif:
urls.append(media.variants[0].url)
elif type(media) == Photo:
urls.append(media.fullUrl)
else:
logger.warning(f"Could not get media URL of {media}")
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)

Wyświetl plik

@ -8,8 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, storage: Storage):
super(WaybackArchiver, self).__init__(storage)
def __init__(self, storage: Storage, driver):
super(WaybackArchiver, self).__init__(storage, driver)
self.seen_urls = {}
def download(self, url, check_if_exists=False):
@ -71,6 +71,7 @@ class WaybackArchiver(Archiver):
except:
title = "Could not get title"
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
screenshot = self.get_screenshot(url)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
self.seen_urls[url] = result
return result

Wyświetl plik

@ -1,29 +1,29 @@
import os
import datetime
import youtube_dl
import yt_dlp
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
name = "youtube_dl"
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
def download(self, url, check_if_exists=False):
netloc = self.get_netloc(url)
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
logger.info('Using Facebook cookie')
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
cdn_url = None
status = 'success'
try:
info = ydl.extract_info(url, download=False)
except youtube_dl.utils.DownloadError:
except yt_dlp.utils.DownloadError:
# no video here
return False
@ -74,6 +74,9 @@ class YoutubeDLArchiver(Archiver):
self.storage.upload(filename, key)
hash = self.get_hash(filename)
screenshot = self.get_screenshot(url)
# get duration
duration = info.get('duration')
@ -86,7 +89,11 @@ class YoutubeDLArchiver(Archiver):
os.remove(filename)
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
if 'timestamp' in info else \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
if 'upload_date' in info and info['upload_date'] is not None else \
None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp)
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)

Wyświetl plik

@ -6,6 +6,7 @@ import shutil
import gspread
from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import archivers
from storages import S3Storage, S3Config
@ -26,15 +27,24 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
cell_updates.append((row, 'status', result.status))
batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.now().isoformat())
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('thumbnail', result.thumbnail,
f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration))
batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash)
if result.timestamp and type(result.timestamp) != str:
result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
batch_if_valid('timestamp', result.timestamp)
if result.timestamp is not None:
if type(result.timestamp) == int:
timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
elif type(result.timestamp) == str:
timestamp_string = result.timestamp
else:
timestamp_string = result.timestamp.isoformat()
batch_if_valid('timestamp', timestamp_string)
gw.batch_set_cell(cell_updates)
@ -50,7 +60,7 @@ def expand_url(url):
return url
def process_sheet(sheet):
def process_sheet(sheet, header=1):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
@ -61,73 +71,97 @@ def process_sheet(sheet):
secret=os.getenv('DO_SPACES_SECRET')
)
options = webdriver.FirefoxOptions()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
gw = GWorksheet(wks)
gw = GWorksheet(wks, header_row=header)
if not gw.col_exists('url'):
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
logger.warning(
f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
if not gw.col_exists('status'):
logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
logger.warning(
f'No "Archive status" column found, skipping worksheet {wks.title}')
continue
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet}/{wks.title}/'
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
archivers.YoutubeDLArchiver(s3_client),
archivers.WaybackArchiver(s3_client)
archivers.TelegramArchiver(s3_client, driver),
archivers.TiktokArchiver(s3_client, driver),
archivers.YoutubeDLArchiver(s3_client, driver),
archivers.TwitterArchiver(s3_client, driver),
archivers.WaybackArchiver(s3_client, driver)
]
values = gw.get_values()
# loop through rows in worksheet
for row in range(2, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
status = gw.get_cell(row, 'status')
for row in range(1 + header, gw.count_rows() + 1):
row_values = values[row-1]
url = gw.get_cell(row_values, 'url')
status = gw.get_cell(row_values, 'status')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = gw.get_cell(row, 'url')
status = gw.get_cell(status, 'status')
url = expand_url(url)
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
url = expand_url(url)
# TODO: add support for multiple videos/images
try:
result = archiver.download(url, check_if_exists=True)
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
try:
result = archiver.download(url, check_if_exists=True)
except Exception as e:
result = False
logger.error(
f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
if result:
if result.status in ['success', 'already archived']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}')
break
logger.warning(
f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
if result:
if result.status in ['success', 'already archived']:
logger.success(f'{archiver} succeeded on row {row}')
break
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
driver.quit()
def main():
parser = argparse.ArgumentParser(
description='Automatically archive social media videos from a Google Sheets document')
parser.add_argument('--sheet', action='store', dest='sheet')
parser.add_argument('--header', action='store', dest='header', default=1, type=int)
args = parser.parse_args()
logger.info(f'Opening document {args.sheet}')
mkdir_if_not_exists('tmp')
process_sheet(args.sheet)
process_sheet(args.sheet, header=args.header)
shutil.rmtree('tmp')
if __name__ == '__main__':
main()

Wyświetl plik

@ -3,7 +3,7 @@ from gspread import utils
class GWorksheet:
COLUMN_NAMES = {
'url': 'media url',
'url': 'link',
'archive': 'archive location',
'date': 'archive date',
'status': 'archive status',
@ -11,12 +11,14 @@ class GWorksheet:
'thumbnail_index': 'thumbnail index',
'timestamp': 'upload timestamp',
'title': 'upload title',
'duration': 'duration'
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash'
}
def __init__(self, worksheet, columns=COLUMN_NAMES):
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
self.wks = worksheet
self.headers = [v.lower() for v in self.wks.row_values(1)]
self.headers = [v.lower() for v in self.wks.row_values(header_row)]
self.columns = columns
def _check_col_exists(self, col: str):
@ -38,6 +40,9 @@ class GWorksheet:
# row is 1-based
return self.wks.row_values(row)
def get_values(self):
return self.wks.get_values()
def get_cell(self, row, col: str):
"""
returns the cell value from (row, col),