Merge branch 'main' of github.com:bellingcat/auto-archiver into main

pull/25/head
Logan Williams 2022-03-18 09:53:29 +01:00
commit 538bb05395
11 zmienionych plików z 317 dodań i 115 usunięć

8
.example.env 100644
Wyświetl plik

@ -0,0 +1,8 @@
DO_SPACES_REGION=
DO_SPACES_KEY=
DO_SPACES_SECRET=
DO_BUCKET=
INTERNET_ARCHIVE_S3_KEY=
INTERNET_ARCHIVE_S3_SECRET=
TELEGRAM_API_ID=
TELEGRAM_API_HASH=

5
.gitignore vendored
Wyświetl plik

@ -6,5 +6,6 @@ service_account.json
__pycache__/ __pycache__/
._* ._*
anu.html anu.html
geckodriver.log *.log
.pytest_cach
anon*

Wyświetl plik

@ -16,6 +16,7 @@ ffmpeg-python = "*"
selenium = "*" selenium = "*"
snscrape = "*" snscrape = "*"
yt-dlp = "*" yt-dlp = "*"
telethon = "*"
[dev-packages] [dev-packages]

242
Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "0d910665da5b5d8da7ab3f03bab399cf615aaab5c036c0eb82a0c16e105cebbe" "sha256": "e27ea0a6fdf6e588c14fbb90af45f784b9e55a9b986a3b50770490648ba96720"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -50,19 +50,87 @@
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", "sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c",
"sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" "sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.21.7" "version": "==1.21.19"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
"sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.24.7" "version": "==1.24.19"
},
"brotli": {
"hashes": [
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
"sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
"sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
"sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
"sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
"sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
"sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
"sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
"sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"markers": "platform_python_implementation == 'CPython'",
"version": "==1.0.9"
}, },
"bs4": { "bs4": {
"hashes": [ "hashes": [
@ -159,10 +227,10 @@
}, },
"cloudscraper": { "cloudscraper": {
"hashes": [ "hashes": [
"sha256:674fd739f9412188aae8d6614e3e6316939fc0670ef5646abd3d316f1a59d3c2", "sha256:0d3413b2fff9f7cf79513b0c9aac58b527c5a2c5163d1c7cc0c4f8cca1d0f4e7",
"sha256:dda29028c5628b5ba3e4dc43816ed38fd46bd945ef938c420f185586a6d8dff2" "sha256:37afe061dee27c6ee1524ff3ca779ef0cc1d63298c3b65ec25f86d1e4b4a2eb9"
], ],
"version": "==1.2.58" "version": "==1.2.60"
}, },
"cryptography": { "cryptography": {
"hashes": [ "hashes": [
@ -189,6 +257,14 @@
], ],
"version": "==36.0.1" "version": "==36.0.1"
}, },
"faker": {
"hashes": [
"sha256:66db859b6abe376d02e805ad81eb8dcfce38f0945f17ee7cdf74ed349985ea52",
"sha256:fe969607836ce7100e38b88dcb598aacb733d895e6e9401894dd603e35623000"
],
"markers": "python_version >= '3.6'",
"version": "==13.3.2"
},
"ffmpeg-python": { "ffmpeg-python": {
"hashes": [ "hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@ -230,19 +306,19 @@
}, },
"google-auth-oauthlib": { "google-auth-oauthlib": {
"hashes": [ "hashes": [
"sha256:06c4ceb3ab2a93b85b8976bbe86cbb82ae1d1c02d2ded3cfd0847a8b6955263b", "sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:8b7ff4d2fe81e3bd034306aa665444360b3c67195b9dea582dddc7dfb8d89d34" "sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==0.5.0" "version": "==0.5.1"
}, },
"gspread": { "gspread": {
"hashes": [ "hashes": [
"sha256:d9db8c43d552f541ea072d4727d1e955bc2368b095dd86c5429a845c9d8aed8f", "sha256:05297b49587b5e89c2a0aa39967f43e5b7f170b62c11ddd43214baa1085131a8",
"sha256:ffba57786e27519fb97125e3de37a0f062134a396506681f5baacaf47a9febe3" "sha256:25173ac081469cf9d621514c6576c6cf46f39c825f178b8cb9e78374a637b0bf"
], ],
"index": "pypi", "index": "pypi",
"version": "==5.1.1" "version": "==5.2.0"
}, },
"h11": { "h11": {
"hashes": [ "hashes": [
@ -262,11 +338,11 @@
}, },
"itsdangerous": { "itsdangerous": {
"hashes": [ "hashes": [
"sha256:29285842166554469a56d427addc0843914172343784cb909695fdbe90a3e129", "sha256:7b7d3023cd35d9cb0c1fd91392f8c95c6fa02c59bf8ad64b8849be3401b95afb",
"sha256:d848fcb8bc7d507c4546b448574e8a44fc4ea2ba84ebf8d783290d53e81992f5" "sha256:935642cd4b987cdbee7210080004033af76306757ff8b4c0a506a4b6e06f02cf"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.1.0" "version": "==2.1.1"
}, },
"jinja2": { "jinja2": {
"hashes": [ "hashes": [
@ -361,49 +437,49 @@
}, },
"markupsafe": { "markupsafe": {
"hashes": [ "hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8", "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759", "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed", "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989", "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3", "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a", "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c", "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c", "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8", "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454", "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad", "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d", "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635", "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61", "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea", "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49", "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce", "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e", "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f", "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f", "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f", "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7", "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a", "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7", "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076", "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb", "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7", "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7", "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c", "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26", "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c", "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8", "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448", "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956", "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05", "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1", "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357", "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea", "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730" "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.1.0" "version": "==2.1.1"
}, },
"mutagen": { "mutagen": {
"hashes": [ "hashes": [
@ -429,14 +505,11 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.1.0" "version": "==1.1.0"
}, },
"py-mini-racer": { "pyaes": {
"hashes": [ "hashes": [
"sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57", "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
"sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
"sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
"sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
], ],
"version": "==0.6.0" "version": "==1.6.1"
}, },
"pyasn1": { "pyasn1": {
"hashes": [ "hashes": [
@ -554,9 +627,7 @@
"version": "==0.19.2" "version": "==0.19.2"
}, },
"requests": { "requests": {
"extras": [ "extras": [],
"socks"
],
"hashes": [ "hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@ -584,7 +655,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8" "version": "==4.8"
}, },
"s3transfer": { "s3transfer": {
@ -597,10 +668,10 @@
}, },
"selenium": { "selenium": {
"hashes": [ "hashes": [
"sha256:7da6d7ab2c83a21e498deda02bb5e7fb0ac5da5e72438f6d01b015b185b5e1df" "sha256:14d28a628c831c105d38305c881c9c7847199bfd728ec84240c5e86fa1c9bd5a"
], ],
"index": "pypi", "index": "pypi",
"version": "==4.1.2" "version": "==4.1.3"
}, },
"six": { "six": {
"hashes": [ "hashes": [
@ -641,6 +712,14 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2.3.1" "version": "==2.3.1"
}, },
"telethon": {
"hashes": [
"sha256:04fdc5fa4ed3e886e6ecf4bad79205ab8880c6aefbd42c29c89c689a502aa816",
"sha256:818cb61281ed3f75ba4da9b68cb69486bed9474d2db4e0aa16e482053117452c"
],
"index": "pypi",
"version": "==1.24.0"
},
"tiktok-downloader": { "tiktok-downloader": {
"git": "https://github.com/msramalho/tiktok-downloader", "git": "https://github.com/msramalho/tiktok-downloader",
"ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c" "ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c"
@ -662,10 +741,7 @@
"version": "==0.9.2" "version": "==0.9.2"
}, },
"urllib3": { "urllib3": {
"extras": [ "extras": [],
"secure",
"socks"
],
"hashes": [ "hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
@ -737,19 +813,19 @@
}, },
"wsproto": { "wsproto": {
"hashes": [ "hashes": [
"sha256:868776f8456997ad0d9720f7322b746bbe9193751b5b290b7f924659377c8c38", "sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b",
"sha256:d8345d1808dd599b5ffb352c25a367adb6157e664e140dbecba3f9bc007edb9f" "sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8"
], ],
"markers": "python_full_version >= '3.6.1'", "markers": "python_version >= '3.7'",
"version": "==1.0.0" "version": "==1.1.0"
}, },
"yt-dlp": { "yt-dlp": {
"hashes": [ "hashes": [
"sha256:81b50ed7cf9cfcc042d8f5a1ad2d1cd7b13c48b36c07faf1880696eac0a7ddb5", "sha256:05179f0f2c34f06910003bb9f80af68ff798b072ca0f826c0e6704a3fbd5b306",
"sha256:b0051920e066379acba6e253adba8bc1592e2ad1b7923df3a56793a4774b0cee" "sha256:68546578c18e6ce87450b53769d5d5b7f5a23e5209784976db6c7ccbf7954b21"
], ],
"index": "pypi", "index": "pypi",
"version": "==2022.2.4" "version": "==2022.3.8.2"
} }
}, },
"develop": {} "develop": {}

Wyświetl plik

@ -1,6 +1,7 @@
# we need to explicitly expose the available imports here # we need to explicitly expose the available imports here
from .base_archiver import * from .base_archiver import *
from .telegram_archiver import * from .telegram_archiver import *
from .telethon_archiver import *
from .tiktok_archiver import * from .tiktok_archiver import *
from .wayback_archiver import * from .wayback_archiver import *
from .youtubedl_archiver import * from .youtubedl_archiver import *

Wyświetl plik

@ -6,8 +6,6 @@ from dataclasses import dataclass
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from urllib.parse import urlparse from urllib.parse import urlparse
import hashlib import hashlib
from selenium.common.exceptions import TimeoutException
from loguru import logger
import time import time
import requests import requests
@ -44,18 +42,41 @@ class Archiver(ABC):
def get_netloc(self, url): def get_netloc(self, url):
return urlparse(url).netloc return urlparse(url).netloc
def generate_media_page(self, urls, url, object): def get_html_key(self, url):
headers = { return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
page = f'''<html><head><title>{url}</title></head> page = f'''<html><head><title>{url}</title></head>
<body> <body>
<h2>Archived media from {self.name}</h2> <h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>''' <h3><a href="{url}">{url}</a></h3><ul>'''
thumbnail = None for url_info in urls_info:
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
return (page_cdn, page_hash, thumbnail)
def generate_media_page(self, urls, url, object):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
thumbnail = None
uploaded_media = []
for media_url in urls: for media_url in urls:
path = urlparse(media_url).path path = urlparse(media_url).path
key = self.get_key(path.replace("/", "_")) key = self.get_key(path.replace("/", "_"))
@ -74,26 +95,9 @@ class Archiver(ABC):
if thumbnail is None: if thumbnail is None:
thumbnail = cdn_url thumbnail = cdn_url
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
page += f'''<li><a href="{cdn_url}">{media_url}</a>: {hash}</li>''' return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
page_filename = 'tmp/' + page_key
page_cdn = self.storage.get_cdn_url(page_key)
with open(page_filename, "w") as f:
f.write(page)
page_hash = self.get_hash(page_filename)
self.storage.upload(page_filename, page_key, extra_args={
'ACL': 'public-read', 'ContentType': 'text/html'})
return (page_cdn, page_hash, thumbnail)
def get_key(self, filename): def get_key(self, filename):
""" """
returns a key in the format "[archiverName]_[filename]" includes extension returns a key in the format "[archiverName]_[filename]" includes extension

Wyświetl plik

@ -39,7 +39,7 @@ class TelegramArchiver(Archiver):
images = [] images = []
for im in image_tags: for im in image_tags:
urls = [u.replace("'", "") for u in re.findall('url\((.*?)\)', im['style'])] urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
images += urls images += urls
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))

Wyświetl plik

@ -0,0 +1,104 @@
import os
import re
import html
from dataclasses import dataclass
from urllib.parse import urlparse
from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from telethon.sync import TelegramClient
@dataclass
class TelegramConfig:
    """Credentials handed to the Telethon ``TelegramClient`` constructor."""

    # Telegram API id; the caller in this commit reads it from TELEGRAM_API_ID.
    api_id: str
    # Telegram API hash; the caller in this commit reads it from TELEGRAM_API_HASH.
    api_hash: str
class TelethonArchiver(Archiver):
    """Archiver for ``https://t.me/...`` post links, backed by a Telethon client.

    Posts without media are archived as a generated HTML page, posts with a
    single media file are archived as that file, and grouped posts (several
    messages sharing one ``grouped_id``) are archived as an HTML page linking
    every uploaded media file.
    """

    name = "telethon"
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)")

    def __init__(self, storage: Storage, driver, config: TelegramConfig):
        """Create the archiver and its Telethon client from ``config``."""
        super().__init__(storage, driver)
        self.client = TelegramClient("./anon", config.api_id, config.api_hash)

    def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
        """
        Searches for Telegram posts that are part of the same group of uploads
        The search is conducted around the id of the original post with an amplitude
        of `max_amp` both ways
        Returns a list of [post] where each post has media and is in the same grouped_id
        """
        if original_post.grouped_id is None:
            return [original_post] if original_post.media is not None else []

        # Never ask for ids below 1: posts near the start of a chat would
        # otherwise produce zero/negative ids in the search window.
        first_id = max(1, original_post.id - max_amp)
        search_ids = list(range(first_id, original_post.id + max_amp + 1))
        posts = self.client.get_messages(chat, ids=search_ids)

        # Ids that do not resolve to a message come back as None; skip them
        # instead of failing on attribute access.
        return [
            post for post in posts
            if post is not None
            and post.grouped_id == original_post.grouped_id
            and post.media is not None
        ]

    def download(self, url, check_if_exists=False):
        """Archive the Telegram post at ``url``.

        Args:
            url: link to the post; must match ``link_pattern``.
            check_if_exists: when true, a grouped post whose HTML page key is
                already in storage is reported as 'already archived' without
                downloading anything.

        Returns:
            ``False`` when the URL is not handled by this archiver or the post
            cannot be fetched, otherwise an ``ArchiveResult``.
        """
        # detect URLs that we definitely cannot handle
        matches = self.link_pattern.findall(url)
        if not matches:
            return False

        _, chat, raw_post_id = matches[0]
        try:
            post_id = int(raw_post_id)
        except ValueError:
            # e.g. a trailing query string was captured with the id
            logger.warning(f'Could not parse telegram post id from {url}')
            return False

        status = "success"
        screenshot = self.get_screenshot(url)

        with self.client.start():
            try:
                post = self.client.get_messages(chat, ids=post_id)
            except ValueError as e:
                logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
                return False

            if post is None:
                # the id did not resolve to a message
                logger.warning(f'Could not find telegram post for {url}')
                return False

            media_posts = self._get_media_posts_in_group(chat, post)

            if len(media_posts) > 1:
                key = self.get_html_key(url)
                cdn_url = self.storage.get_cdn_url(key)

                if check_if_exists and self.storage.exists(key):
                    return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)

                group_id = post.grouped_id if post.grouped_id is not None else post.id
                uploaded_media = []
                # Use the longest caption found in the group as the title.
                message = post.message
                for mp in media_posts:
                    caption = mp.message or ""
                    if len(caption) > len(message or ""):
                        message = caption

                    filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
                    if filename is None:
                        # nothing was written to disk for this media
                        logger.warning(f'Could not download media of post {mp.id} from {url}')
                        continue

                    key = filename.split('tmp/')[1]
                    self.storage.upload(filename, key)
                    media_hash = self.get_hash(filename)
                    cdn_url = self.storage.get_cdn_url(key)
                    uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': media_hash})
                    os.remove(filename)

                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))

                return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot)

            if len(media_posts) == 1:
                key = self.get_key(f'{chat}_{post_id}')
                filename = self.client.download_media(post.media, f'tmp/{key}')

                # When no file was produced, fall through to the media-less page below.
                if filename is not None:
                    key = filename.split('tmp/')[1].replace(" ", "")
                    self.storage.upload(filename, key)
                    media_hash = self.get_hash(filename)
                    cdn_url = self.storage.get_cdn_url(key)
                    key_thumb, thumb_index = self.get_thumbnails(filename, key)
                    os.remove(filename)

                    return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=media_hash, screenshot=screenshot)

            # No downloadable media: archive the post data as an HTML page only.
            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)

Wyświetl plik

@ -71,6 +71,10 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
key=os.getenv('DO_SPACES_KEY'), key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET') secret=os.getenv('DO_SPACES_SECRET')
) )
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.headless = True options.headless = True
@ -98,6 +102,7 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
# order matters, first to succeed excludes remaining # order matters, first to succeed excludes remaining
active_archivers = [ active_archivers = [
archivers.TelethonArchiver(s3_client, driver, telegram_config),
archivers.TelegramArchiver(s3_client, driver), archivers.TelegramArchiver(s3_client, driver),
archivers.TiktokArchiver(s3_client, driver), archivers.TiktokArchiver(s3_client, driver),
archivers.YoutubeDLArchiver(s3_client, driver), archivers.YoutubeDLArchiver(s3_client, driver),

Wyświetl plik

@ -1,3 +1,4 @@
from loguru import logger
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
@ -6,14 +7,15 @@ class Storage(ABC):
def __init__(self, config): pass def __init__(self, config): pass
@abstractmethod @abstractmethod
def get_cdn_url(self, path): pass def get_cdn_url(self, key): pass
@abstractmethod @abstractmethod
def exists(self, path): pass def exists(self, key): pass
@abstractmethod @abstractmethod
def uploadf(self, file, key, **kwargs): pass def uploadf(self, file, key, **kwargs): pass
def upload(self, filename: str, key: str, **kwargs): def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs) self.uploadf(f, key, **kwargs)

Wyświetl plik

@ -1,3 +1,3 @@
# we need to explicitly expose the available imports here # we need to explicitly expose the available imports here
from .gworksheet import GWorksheet from .gworksheet import *
from .misc import * from .misc import *