Merge pull request #44 from bellingcat/vk-url-lib

pull/50/head
Miguel Sozinho Ramalho 2022-06-21 14:40:08 +01:00 zatwierdzone przez GitHub
commit 76b531c56a
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
5 zmienionych plików z 103 dodań i 105 usunięć

Wyświetl plik

@ -22,8 +22,8 @@ google-auth-oauthlib = "*"
oauth2client = "*"
python-slugify = "*"
pyyaml = "*"
vk-api = "*"
dateparser = "*"
vk-url-scraper = "*"
[requires]
python_version = "3.9"

79
Pipfile.lock wygenerowano
Wyświetl plik

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd"
"sha256": "eacd9633c33d4d526d7737fc6bf83ab713205f28f819530f549378fbd14da3d8"
},
"pipfile-spec": 6,
"requires": {
@ -50,19 +50,19 @@
},
"boto3": {
"hashes": [
"sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4",
"sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8"
"sha256:13efff22f1cb6d25ec7027edaccdfdd515ba593e093173beb09094cff898a8cc",
"sha256:945d49941541a3cbb02710361be64b22f98e68c2e447229f0d51f7c215009e28"
],
"index": "pypi",
"version": "==1.24.9"
"version": "==1.24.13"
},
"botocore": {
"hashes": [
"sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad",
"sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6"
"sha256:df75e53576b061818bbce4bd70221749e40cc91d16a2b6c03fbeec8023665734",
"sha256:fbc09558c02d415e8646520f95db7e8d313460938780fa6040b00865f098fd55"
],
"markers": "python_version >= '3.7'",
"version": "==1.27.9"
"version": "==1.27.13"
},
"brotli": {
"hashes": [
@ -149,11 +149,11 @@
},
"certifi": {
"hashes": [
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2022.5.18.1"
"markers": "python_version >= '3.6'",
"version": "==2022.6.15"
},
"cffi": {
"hashes": [
@ -308,11 +308,11 @@
},
"google-api-core": {
"hashes": [
"sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0",
"sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359"
"sha256:06f7244c640322b508b125903bb5701bebabce8832f85aba9335ec00b3d02edc",
"sha256:93c6a91ccac79079ac6bbf8b74ee75db970cc899278b97d53bc012f35908cf50"
],
"markers": "python_full_version >= '3.6.0'",
"version": "==2.8.1"
"markers": "python_version >= '3.6'",
"version": "==2.8.2"
},
"google-api-python-client": {
"hashes": [
@ -351,7 +351,7 @@
"sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c",
"sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==1.56.2"
},
"gspread": {
@ -367,7 +367,7 @@
"sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06",
"sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==0.13.0"
},
"httplib2": {
@ -412,11 +412,11 @@
},
"jmespath": {
"hashes": [
"sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e",
"sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04"
"sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
"sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
"markers": "python_version >= '3.7'",
"version": "==1.0.0"
"version": "==1.0.1"
},
"loguru": {
"hashes": [
@ -562,7 +562,7 @@
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"outcome": {
@ -690,7 +690,7 @@
"sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
"sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2.12.0"
},
"pyopenssl": {
@ -724,6 +724,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"python-dotenv": {
"hashes": [
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
"markers": "python_version >= '3.5'",
"version": "==0.20.0"
},
"python-slugify": {
"hashes": [
"sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1",
@ -863,11 +871,10 @@
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2022.3.2"
},
"requests": {
"extras": [],
"hashes": [
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
@ -903,7 +910,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version < '4' and python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8"
},
"s3transfer": {
@ -957,7 +964,7 @@
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2.3.2.post1"
},
"telethon": {
@ -1011,7 +1018,7 @@
"sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9",
"sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==2022.1"
},
"tzlocal": {
@ -1019,7 +1026,7 @@
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==4.2"
},
"uritemplate": {
@ -1027,14 +1034,11 @@
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
"markers": "python_full_version >= '3.6.0'",
"markers": "python_version >= '3.6'",
"version": "==4.1.1"
},
"urllib3": {
"extras": [
"secure",
"socks"
],
"extras": [],
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@ -1047,9 +1051,16 @@
"sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc",
"sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3"
],
"index": "pypi",
"version": "==11.9.8"
},
"vk-url-scraper": {
"hashes": [
"sha256:181c8a4b69e395a68bdf00e3dc1717e5218960c9fda6e90eea9633ff26fc9257",
"sha256:9cfc6bc3d7259f18508c3822955efac21ff9bad5bd886010b10f098ea10ad551"
],
"index": "pypi",
"version": "==0.3.2"
},
"websockets": {
"hashes": [
"sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",

Wyświetl plik

@ -197,8 +197,8 @@ class Archiver(ABC):
return self.storage.get_cdn_url(key)
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = filename.split('.')[0] + '/'
key_folder = key.split('.')[0] + '/'
thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
key_folder = key.split('.')[0] + os.path.sep
mkdir_if_not_exists(thumbnails_folder)
@ -222,7 +222,7 @@ class Archiver(ABC):
for fname in thumbnails:
if fname[-3:] == 'jpg':
thumbnail_filename = thumbnails_folder + fname
key = key_folder + fname
key = os.path.join(key_folder, fname)
self.storage.upload(thumbnail_filename, key)
cdn_url = self.storage.get_cdn_url(key)

Wyświetl plik

@ -1,8 +1,7 @@
import re, json, requests
import re, json, mimetypes, os
import vk_api, dateparser
from bs4 import BeautifulSoup
from loguru import logger
from vk_url_scraper import VkScraper, DateTimeEncoder
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
@ -17,73 +16,58 @@ class VkArchiver(Archiver):
name = "vk"
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
onclick_pattern = re.compile(r"({.*})")
def __init__(self, storage: Storage, driver, config: VkConfig):
super().__init__(storage, driver)
if config != None:
self.vk_session = vk_api.VkApi(config.username, config.password)
self.vk_session.auth(token_only=True)
self.vks = VkScraper(config.username, config.password)
def download(self, url, check_if_exists=False):
# detect URLs that this archiver can handle
_id, method = None, None
if has_wall := self.wall_pattern.search(url):
_id = has_wall[0]
method = self.archive_wall
elif has_photo := self.photo_pattern.search(url):
_id = has_photo[0]
method = self.archive_photo
else: return False
if not hasattr(self, "vks") or self.vks is None:
logger.debug("VK archiver was not supplied with credentials.")
return False
logger.info(f"found valid {_id=} from {url=}")
proper_url = f'https://vk.com/{_id}'
key = self.get_html_key(url)
# if check_if_exists and self.storage.exists(key):
# screenshot = self.get_screenshot(url)
# cdn_url = self.storage.get_cdn_url(key)
# return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
# if check_if_exists is set and the key is already in storage, do not download again
key = self.get_html_key(proper_url)
if check_if_exists and self.storage.exists(key):
screenshot = self.get_screenshot(proper_url)
results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
if len(results) == 0:
return False
def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
textual_output = ""
title, datetime = results[0]["text"], results[0]["datetime"]
urls_found = []
for res in results:
textual_output += f"id: {res['id']}<br>time utc: {res['datetime']}<br>text: {res['text']}<br>payload: {dump_payload(res['payload'])}<br><hr/><br>"
title = res["text"] if len(title) == 0 else title
datetime = res["datetime"] if not datetime else datetime
for attachments in res["attachments"].values():
urls_found.extend(attachments)
# we don't call generate_media_page which downloads urls because it cannot download vk video urls
thumbnail, thumbnail_index = None, None
uploaded_media = []
filenames = self.vks.download_media(results, Storage.TMP_FOLDER)
for filename in filenames:
key = self.get_key(filename)
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
try:
_type = mimetypes.guess_type(filename)[0].split("/")[0]
if _type == "image" and thumbnail is None:
thumbnail = cdn_url
if _type == "video" and (thumbnail is None or thumbnail_index is None):
thumbnail, thumbnail_index = self.get_thumbnails(filename, key)
except Exception as e:
logger.warning(f"failed to get thumb for {filename=} with {e=}")
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
try:
return method(proper_url, _id)
except Exception as e:
logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}")
return False
def archive_photo(self, photo_url, photo_id):
headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version}
req = requests.get("https://api.vk.com/method/photos.getById", headers)
res = req.json()["response"][0]
title = res["text"][:200] # more on the page
img_url = res["orig_photo"]["url"]
time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})
page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res)
screenshot = self.get_screenshot(photo_url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
def archive_wall(self, wall_url, wall_id):
headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version}
req = requests.get("https://api.vk.com/method/wall.getById", headers)
res = req.json()["response"]
wall = res["items"][0]
img_urls = []
if "attachments" in wall:
for a in wall["attachments"]:
attachment = a[a["type"]]
if "thumb" in attachment:
attachment = attachment["thumb"]
if "sizes" in attachment:
try: img_urls.append(attachment["sizes"][-1]["url"])
except Exception as e:
logger.warning(f"could not get image from attachment: {e}")
title = wall["text"][:200] # more on the page
time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"})
page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res)
screenshot = self.get_screenshot(wall_url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title)
page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail)
# if multiple wall/photos/videos are present, the screenshot will only capture the first one
screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)

Wyświetl plik

@ -1,5 +1,6 @@
import os, sys, requests
import os, json, requests
from datetime import datetime
from loguru import logger
@ -19,10 +20,12 @@ def expand_url(url):
logger.error(f'Failed to expand url {url}')
return url
def getattr_or(o: object, prop: str, default=None):
    """Return attribute ``prop`` of ``o``, falling back to ``default``.

    The fallback is used when the attribute lookup fails for any ordinary
    reason (missing attribute, a property that raises, an invalid name) or
    when the attribute exists but its value is ``None``. Falsy values other
    than ``None`` (``0``, ``""``, ``[]``) are returned as-is.

    Args:
        o: Object to read the attribute from.
        prop: Name of the attribute to look up.
        default: Value returned when the attribute is unavailable or ``None``.

    Returns:
        The attribute value, or ``default``.
    """
    try:
        res = getattr(o, prop)
    except Exception:
        # Best-effort lookup: any regular failure means "use the default".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return default
    # Explicit None check replaces the former bare `raise`, which only worked
    # by provoking a RuntimeError that the bare except then caught.
    return default if res is None else res