Merge pull request #40 from bellingcat/vk-archiver

pull/33/head
Miguel Sozinho Ramalho 2022-06-16 16:18:48 +01:00 zatwierdzone przez GitHub
commit b7f1ec5404
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
17 zmienionych plików z 357 dodań i 80 usunięć

3
.gitignore vendored
Wyświetl plik

@ -15,4 +15,5 @@ config-*.json
config.yaml
config-*.yaml
logs/*
local_archive/
local_archive/
vk_config*.json

Wyświetl plik

@ -22,6 +22,8 @@ google-auth-oauthlib = "*"
oauth2client = "*"
python-slugify = "*"
pyyaml = "*"
vk-api = "*"
dateparser = "*"
[requires]
python_version = "3.9"

166
Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a"
"sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd"
},
"pipfile-spec": 6,
"requires": {
@ -50,19 +50,19 @@
},
"boto3": {
"hashes": [
"sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201",
"sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41"
"sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4",
"sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8"
],
"index": "pypi",
"version": "==1.24.8"
"version": "==1.24.9"
},
"botocore": {
"hashes": [
"sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027",
"sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f"
"sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad",
"sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6"
],
"markers": "python_version >= '3.7'",
"version": "==1.27.8"
"version": "==1.27.9"
},
"brotli": {
"hashes": [
@ -152,7 +152,7 @@
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.5.18.1"
},
"cffi": {
@ -267,6 +267,14 @@
],
"version": "==37.0.2"
},
"dateparser": {
"hashes": [
"sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9",
"sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628"
],
"index": "pypi",
"version": "==1.1.1"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@ -303,7 +311,7 @@
"sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0",
"sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2.8.1"
},
"google-api-python-client": {
@ -316,11 +324,11 @@
},
"google-auth": {
"hashes": [
"sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1",
"sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475"
"sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89",
"sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.7.0"
"version": "==2.8.0"
},
"google-auth-httplib2": {
"hashes": [
@ -343,7 +351,7 @@
"sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c",
"sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==1.56.2"
},
"gspread": {
@ -359,7 +367,7 @@
"sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06",
"sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==0.13.0"
},
"httplib2": {
@ -554,7 +562,7 @@
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==3.2.0"
},
"outcome": {
@ -682,7 +690,7 @@
"sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
"sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2.12.0"
},
"pyopenssl": {
@ -724,6 +732,21 @@
"index": "pypi",
"version": "==6.1.2"
},
"pytz": {
"hashes": [
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"version": "==2022.1"
},
"pytz-deprecation-shim": {
"hashes": [
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==0.1.0.post0"
},
"pyyaml": {
"hashes": [
"sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
@ -763,7 +786,88 @@
"index": "pypi",
"version": "==6.0"
},
"regex": {
"hashes": [
"sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14",
"sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9",
"sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204",
"sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f",
"sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737",
"sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b",
"sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3",
"sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4",
"sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac",
"sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f",
"sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29",
"sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772",
"sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1",
"sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863",
"sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66",
"sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed",
"sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47",
"sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f",
"sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f",
"sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008",
"sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d",
"sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571",
"sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0",
"sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a",
"sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3",
"sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7",
"sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447",
"sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493",
"sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4",
"sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede",
"sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640",
"sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd",
"sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c",
"sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee",
"sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30",
"sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b",
"sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec",
"sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1",
"sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e",
"sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8",
"sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9",
"sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231",
"sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7",
"sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729",
"sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960",
"sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056",
"sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357",
"sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7",
"sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3",
"sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7",
"sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573",
"sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0",
"sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178",
"sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f",
"sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834",
"sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c",
"sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015",
"sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0",
"sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57",
"sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635",
"sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07",
"sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2",
"sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1",
"sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b",
"sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2",
"sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5",
"sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b",
"sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86",
"sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5",
"sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93",
"sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0",
"sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f",
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.3.2"
},
"requests": {
"extras": [],
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
@ -799,7 +903,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"markers": "python_version < '4' and python_full_version >= '3.6.0'",
"version": "==4.8"
},
"s3transfer": {
@ -853,7 +957,7 @@
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==2.3.2.post1"
},
"telethon": {
@ -902,12 +1006,28 @@
"markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"tzdata": {
"hashes": [
"sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9",
"sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.1"
},
"tzlocal": {
"hashes": [
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==4.2"
},
"uritemplate": {
"hashes": [
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.6.0'",
"version": "==4.1.1"
},
"urllib3": {
@ -922,6 +1042,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"vk-api": {
"hashes": [
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
],
"index": "pypi",
"version": "==11.9.8"
},
"websockets": {
"hashes": [
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",

Wyświetl plik

@ -140,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t
# auto_auto_archiver
To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. It must be shared with the same service account.
To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archiver.py` but use `--sheet` to specify the name of the sheet that lists the names of sheets to archive. It must be shared with the same service account.
![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png)
@ -152,15 +152,16 @@ Code is split into functional concepts:
1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
### Current Archivers
Archivers are tested in a meaningful order with Wayback Machine being the default, that can easily be changed in the code.
Archivers are tested in a meaningful order with Wayback Machine being the failsafe, that can easily be changed in the code.
```mermaid
graph TD
A(Archiver) -->|parent of| B(YoutubeDLArchiver)
A -->|parent of| C(TikTokArchiver)
A -->|parent of| D(TwitterArchiver)
A(Archiver) -->|parent of| B(TelethonArchiver)
A -->|parent of| C(TiktokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
A -->|parent of| E(TelegramArchiver)
A -->|parent of| F(TelethonArchiver)
A -->|parent of| G(WaybackArchiver)
A -->|parent of| F(TwitterArchiver)
A -->|parent of| G(VkArchiver)
A -->|parent of| H(WaybackArchiver)
```
### Current Storages
```mermaid

Wyświetl plik

@ -5,4 +5,5 @@ from .telethon_archiver import TelethonArchiver
from .tiktok_archiver import TiktokArchiver
from .wayback_archiver import WaybackArchiver
from .youtubedl_archiver import YoutubeDLArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_archiver import TwitterArchiver
from .vk_archiver import VkArchiver

Wyświetl plik

@ -1,4 +1,4 @@
import os, datetime, shutil, hashlib, time, requests, re
import os, datetime, shutil, hashlib, time, requests, re, mimetypes
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
@ -58,7 +58,13 @@ class Archiver(ABC):
<h3><a href="{url}">{url}</a></h3><ul>'''
for url_info in urls_info:
page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''
mime_global = self._guess_file_type(url_info["key"])
preview = ""
if mime_global == "image":
preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
elif mime_global == "video":
preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''
page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
page += f"</body></html>"
@ -77,7 +83,18 @@ class Archiver(ABC):
page_cdn = self.storage.get_cdn_url(page_key)
return (page_cdn, page_hash, thumbnail)
def _guess_file_type(self, path: str):
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
"""
mime = mimetypes.guess_type(path)[0]
if mime is not None:
return mime.split("/")[0]
return ""
# eg images in a tweet save to cloud storage
def generate_media_page(self, urls, url, object):
"""
For a list of media urls, fetch them, upload them
@ -208,12 +225,11 @@ class Archiver(ABC):
key = key_folder + fname
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)
cdn_urls.append(cdn_url)
if len(cdn_urls) == 0:
return ('None', 'None')
return ('', '')
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

Wyświetl plik

@ -53,7 +53,6 @@ class TelegramArchiver(Archiver):
key = self.get_key(video_id)
filename = os.path.join(Storage.TMP_FOLDER, key)
cdn_url = self.storage.get_cdn_url(key)
if check_if_exists and self.storage.exists(key):
status = 'already archived'
@ -84,5 +83,6 @@ class TelegramArchiver(Archiver):
filename, key, duration=duration)
os.remove(filename)
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot)

Wyświetl plik

@ -8,6 +8,7 @@ from telethon.errors import ChannelInvalidError
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import TelethonConfig
from utils import getattr_or
class TelethonArchiver(Archiver):
@ -16,8 +17,9 @@ class TelethonArchiver(Archiver):
def __init__(self, storage: Storage, driver, config: TelethonConfig):
super().__init__(storage, driver)
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token
if config:
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
self.bot_token = config.bot_token
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
"""
@ -26,8 +28,8 @@ class TelethonArchiver(Archiver):
of `max_amp` both ways
Returns a list of [post] where each post has media and is in the same grouped_id
"""
if original_post.grouped_id is None:
return [original_post] if original_post.media is not None else []
if getattr_or(original_post, "grouped_id") is None:
return [original_post] if getattr_or(original_post, "media") else []
search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
posts = self.client.get_messages(chat, ids=search_ids)
@ -38,6 +40,10 @@ class TelethonArchiver(Archiver):
return media
def download(self, url, check_if_exists=False):
if not hasattr(self, "client"):
logger.error('Missing Telethon config')
return False
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
if not len(matches):
@ -61,12 +67,14 @@ class TelethonArchiver(Archiver):
logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
return False
if post is None: return False
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
screenshot = self.get_screenshot(url)
if len(media_posts) > 1:
if len(media_posts) > 0:
key = self.get_html_key(url)
if check_if_exists and self.storage.exists(key):
@ -78,7 +86,7 @@ class TelethonArchiver(Archiver):
group_id = post.grouped_id if post.grouped_id is not None else post.id
uploaded_media = []
message = post.message
for mp in media_posts:
for i, mp in enumerate(media_posts):
if len(mp.message) > len(message): message = mp.message
filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)
@ -87,22 +95,13 @@ class TelethonArchiver(Archiver):
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
if i == 0:
key_thumb, thumb_index = self.get_thumbnails(filename, key)
os.remove(filename)
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER, key))
key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "")
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
key_thumb, thumb_index = self.get_thumbnails(filename, key)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)

Wyświetl plik

@ -0,0 +1,89 @@
import re, json, requests
import vk_api, dateparser
from bs4 import BeautifulSoup
from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import VkConfig
class VkArchiver(Archiver):
    """
    VK videos are handled by YTDownloader, this archiver gets posts text and images.
    Currently only works for /wall posts
    """
    name = "vk"
    # ".{0,1}" allows an optional sign/separator char between the keyword and the numeric ids
    wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
    photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
    onclick_pattern = re.compile(r"({.*})")

    def __init__(self, storage: Storage, driver, config: VkConfig):
        super().__init__(storage, driver)
        # only authenticate when credentials were configured; download() guards on this
        if config is not None:
            self.vk_session = vk_api.VkApi(config.username, config.password)
            self.vk_session.auth(token_only=True)

    def download(self, url, check_if_exists=False):
        """Archive a VK wall or photo URL; returns an ArchiveResult or False if unhandled."""
        # cannot talk to the VK API without credentials (same guard style as TelethonArchiver)
        if not hasattr(self, "vk_session"):
            logger.error('Missing VK config')
            return False

        # detect URLs that this archiver can handle
        _id, method = None, None
        if has_wall := self.wall_pattern.search(url):
            _id = has_wall[0]
            method = self.archive_wall
        elif has_photo := self.photo_pattern.search(url):
            _id = has_photo[0]
            method = self.archive_photo
        else: return False

        logger.info(f"found valid {_id=} from {url=}")
        proper_url = f'https://vk.com/{_id}'

        # if check if exists will not download again
        key = self.get_html_key(proper_url)
        if check_if_exists and self.storage.exists(key):
            screenshot = self.get_screenshot(proper_url)
            cdn_url = self.storage.get_cdn_url(key)
            return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)

        try:
            return method(proper_url, _id)
        except Exception as e:
            logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}")
        return False

    def archive_photo(self, photo_url, photo_id):
        """Fetch a single photo's metadata and original image via photos.getById."""
        # these are sent as query-string parameters (requests.get's 2nd positional arg is `params`)
        params = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version}
        req = requests.get("https://api.vk.com/method/photos.getById", params)
        res = req.json()["response"][0]
        title = res["text"][:200]  # more on the page
        img_url = res["orig_photo"]["url"]
        time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})

        page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res)
        screenshot = self.get_screenshot(photo_url)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)

    def archive_wall(self, wall_url, wall_id):
        """Fetch a wall post (text + attached images) via wall.getById."""
        # these are sent as query-string parameters (requests.get's 2nd positional arg is `params`)
        params = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version}
        req = requests.get("https://api.vk.com/method/wall.getById", params)
        res = req.json()["response"]
        wall = res["items"][0]
        img_urls = []
        if "attachments" in wall:
            for a in wall["attachments"]:
                attachment = a[a["type"]]
                # videos/links carry their preview under "thumb"
                if "thumb" in attachment:
                    attachment = attachment["thumb"]
                if "sizes" in attachment:
                    # last entry in "sizes" is the largest available rendition
                    try: img_urls.append(attachment["sizes"][-1]["url"])
                    except Exception as e:
                        logger.warning(f"could not get image from attachment: {e}")

        title = wall["text"][:200]  # more on the page
        time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})

        page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res)
        screenshot = self.get_screenshot(wall_url)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)

Wyświetl plik

@ -18,10 +18,12 @@ class WaybackArchiver(Archiver):
def __init__(self, storage: Storage, driver, config: WaybackConfig):
super(WaybackArchiver, self).__init__(storage, driver)
self.config = config
# TODO: this logic should live at the auto-archiver level
self.seen_urls = {}
def download(self, url, check_if_exists=False):
if self.config is None:
logger.error('Missing Wayback config')
return False
if check_if_exists:
if url in self.seen_urls: return self.seen_urls[url]
@ -57,7 +59,7 @@ class WaybackArchiver(Archiver):
retries += 1
if status_r.status_code != 200:
return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)
status_json = status_r.json()
if status_json['status'] != 'success':

Wyświetl plik

@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver):
os.remove(filename)
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \
if 'timestamp' in info else \
datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \
if 'upload_date' in info and info['upload_date'] is not None else \
None
timestamp = None
if 'timestamp' in info and info['timestamp'] is not None:
timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
elif 'upload_date' in info and info['upload_date'] is not None:
timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)

Wyświetl plik

@ -3,7 +3,7 @@ import os, datetime, shutil, traceback, random
from loguru import logger
from slugify import slugify
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
from storages import Storage
@ -95,6 +95,7 @@ def process_sheet(c: Config):
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]

Wyświetl plik

@ -1,29 +1,30 @@
import gspread
import argparse
import shutil
import auto_archive
from loguru import logger
from configs import Config
from storages import Storage
from utils import mkdir_if_not_exists
def main():
parser = argparse.ArgumentParser(
description="Automatically use youtube-dl to download media from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
c = Config()
c.parse()
logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
args = parser.parse_args()
logger.info("Opening document " + args.sheet)
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(args.sheet)
gc = c.gsheets_client
sh = gc.open(c.sheet)
wks = sh.get_worksheet(0)
values = wks.get_all_values()
mkdir_if_not_exists(Storage.TMP_FOLDER)
for i in range(11, len(values)):
sheet_name = values[i][0]
c.sheet = values[i][0]
logger.info(f"Processing {c.sheet}")
auto_archive.process_sheet(c)
c.destroy_webdriver()
shutil.rmtree(Storage.TMP_FOLDER)
logger.info("Processing " + sheet_name)
auto_archive.process_sheet(sheet_name)
if __name__ == "__main__":
main()

Wyświetl plik

@ -1,4 +1,5 @@
from .config import Config
from .selenium_config import SeleniumConfig
from .telethon_config import TelethonConfig
from .wayback_config import WaybackConfig
from .wayback_config import WaybackConfig
from .vk_config import VkConfig

Wyświetl plik

@ -4,11 +4,13 @@ import gspread
from loguru import logger
from selenium import webdriver
from dataclasses import asdict
from selenium.common.exceptions import TimeoutException
from utils import GWorksheet, getattr_or
from .wayback_config import WaybackConfig
from .telethon_config import TelethonConfig
from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
@ -120,6 +122,7 @@ class Config:
secret=secrets["wayback"]["secret"],
)
else:
self.wayback_config = None
logger.debug(f"'wayback' key not present in the {self.config_file=}")
# telethon config
@ -130,8 +133,19 @@ class Config:
bot_token=secrets["telegram"].get("bot_token", None)
)
else:
self.telegram_config = None
logger.debug(f"'telegram' key not present in the {self.config_file=}")
# vk config
if "vk" in secrets:
self.vk_config = VkConfig(
username=secrets["vk"]["username"],
password=secrets["vk"]["password"]
)
else:
self.vk_config = None
logger.debug(f"'vk' key not present in the {self.config_file=}")
del self.config["secrets"] # delete to prevent leaks
def set_log_files(self):
@ -197,16 +211,23 @@ class Config:
def destroy_webdriver(self):
if self.webdriver is not None and type(self.webdriver) != str:
self.webdriver.quit()
del self.webdriver
def recreate_webdriver(self):
self.destroy_webdriver()
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
self.webdriver = webdriver.Firefox(options=options)
self.webdriver.set_window_size(self.selenium_config.window_width,
try:
new_webdriver = webdriver.Firefox(options=options)
# only destroy if creation is successful
self.destroy_webdriver()
self.webdriver = new_webdriver
self.webdriver.set_window_size(self.selenium_config.window_width,
self.selenium_config.window_height)
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
def __str__(self) -> str:
return json.dumps({
@ -225,6 +246,7 @@ class Config:
"local_config": hasattr(self, "local_config"),
"wayback_config": self.wayback_config != None,
"telegram_config": self.telegram_config != None,
"vk_config": self.vk_config != None,
"gsheets_client": self.gsheets_client != None,
"column_names": self.column_names,
}, ensure_ascii=False, indent=4)

Wyświetl plik

@ -0,0 +1,8 @@
from dataclasses import dataclass
@dataclass
class VkConfig:
    """Credentials used by VkArchiver to authenticate a vk_api session."""
    username: str  # phone number or email for the VK account
    password: str  # account password

Wyświetl plik

@ -39,6 +39,11 @@ secrets:
# optional, but allows access to more content such as large videos, talk to @botfather
bot_token: your bot-token
# vkontakte (vk.com) credentials
vk:
username: "phone number or email"
password: "password"
google_sheets:
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
service_account: "service_account.json"