diff --git a/Pipfile b/Pipfile index 1b55d86..a14995b 100644 --- a/Pipfile +++ b/Pipfile @@ -22,8 +22,8 @@ google-auth-oauthlib = "*" oauth2client = "*" python-slugify = "*" pyyaml = "*" -vk-api = "*" dateparser = "*" +vk-url-scraper = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 0b911f3..172df30 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd" + "sha256": "eacd9633c33d4d526d7737fc6bf83ab713205f28f819530f549378fbd14da3d8" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4", - "sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8" + "sha256:0b9757575b8003928defc5fb6e816936fa1bdb1384d0edec6622bb9fb104e96c", + "sha256:f39b91a4c3614db8e44912ee82426fb4b16d5df2cd66883f3aff6f76d7f5d310" ], "index": "pypi", - "version": "==1.24.9" + "version": "==1.24.12" }, "botocore": { "hashes": [ - "sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad", - "sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6" + "sha256:17d3ec9f684d21e06b64d9cb224934557bcd95031e2ecb551bf16271e8722fec", + "sha256:b8ac156e55267da6e728ea0b806bfcd97adf882801cffe7849c4b88ce4780326" ], "markers": "python_version >= '3.7'", - "version": "==1.27.9" + "version": "==1.27.12" }, "brotli": { "hashes": [ @@ -149,11 +149,11 @@ }, "certifi": { "hashes": [ - "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", - "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" + "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", + "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "markers": "python_full_version >= '3.6.0'", - "version": "==2022.5.18.1" + "markers": "python_version >= '3.6'", + "version": "==2022.6.15" }, "cffi": { "hashes": [ @@ -308,11 +308,11 @@ }, "google-api-core": { "hashes": [ - "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0", - "sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359" + "sha256:06f7244c640322b508b125903bb5701bebabce8832f85aba9335ec00b3d02edc", + "sha256:93c6a91ccac79079ac6bbf8b74ee75db970cc899278b97d53bc012f35908cf50" ], - "markers": "python_full_version >= '3.6.0'", - "version": "==2.8.1" + "markers": "python_version >= '3.6'", + "version": "==2.8.2" }, "google-api-python-client": { "hashes": [ @@ -351,7 +351,7 @@ "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==1.56.2" }, "gspread": { @@ -367,7 +367,7 @@ "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06", "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==0.13.0" }, "httplib2": { @@ -412,11 +412,11 @@ }, "jmespath": { "hashes": [ - "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", - "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" + "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", + "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe" ], "markers": "python_version >= '3.7'", - "version": "==1.0.0" + "version": "==1.0.1" }, "loguru": { "hashes": [ @@ -562,7 +562,7 @@ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==3.2.0" }, "outcome": { @@ -690,7 +690,7 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.12.0" }, "pyopenssl": { @@ -724,6 +724,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, + "python-dotenv": { + "hashes": [ + "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", + "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" + ], + "markers": "python_version >= '3.5'", + "version": "==0.20.0" + }, "python-slugify": { "hashes": [ "sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1", @@ -863,11 +871,13 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2022.3.2" }, "requests": { - "extras": [], + "extras": [ + "socks" + ], "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" @@ -903,7 +913,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version < '4' and python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -957,7 +967,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.3.2.post1" }, "telethon": { @@ -1011,7 +1021,7 @@ "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2022.1" }, "tzlocal": { @@ -1019,7 +1029,7 @@ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==4.2" }, "uritemplate": { @@ -1027,7 +1037,7 @@ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==4.1.1" }, "urllib3": { @@ -1047,9 +1057,16 @@ "sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", "sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" ], - "index": "pypi", "version": "==11.9.8" }, + "vk-url-scraper": { + "hashes": [ + "sha256:de74b161e8bae153160e1a6f0521457cb38a02a91e1dc598a41aef236d966b70", + "sha256:fcd2ec01ac217f5257252cd83b1fccb2af59121c757b60488ba0e125d8bac7f1" + ], + "index": "pypi", + "version": "==0.1.5" + }, "websockets": { "hashes": [ "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index e48e9ef..0f2c0ac 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -1,8 +1,8 @@ -import re, json, requests +import re, json -import vk_api, dateparser -from bs4 import BeautifulSoup from loguru import logger +from utils.misc import DateTimeEncoder +from vk_url_scraper import VkScraper from storages import Storage from .base_archiver import Archiver, ArchiveResult @@ -17,73 +17,40 @@ class VkArchiver(Archiver): name = "vk" wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - onclick_pattern = re.compile(r"({.*})") def __init__(self, storage: Storage, driver, config: VkConfig): super().__init__(storage, driver) if config != None: - self.vk_session = vk_api.VkApi(config.username, config.password) - self.vk_session.auth(token_only=True) + self.vks = VkScraper(config.username, config.password) def download(self, url, check_if_exists=False): - # detect URLs that this archiver can handle - _id, method = None, None - if has_wall := self.wall_pattern.search(url): - _id = has_wall[0] - method = self.archive_wall - elif has_photo := self.photo_pattern.search(url): - _id = has_photo[0] - method = self.archive_photo - else: return False + if not hasattr(self, "vks") or self.vks is None: + logger.debug("VK archiver was not supplied with credentials.") + return False - logger.info(f"found valid {_id=} from {url=}") - proper_url = f'https://vk.com/{_id}' - - # if check if exists will not download again - key = self.get_html_key(proper_url) + key = self.get_html_key(url) if check_if_exists and self.storage.exists(key): - screenshot = self.get_screenshot(proper_url) + screenshot = self.get_screenshot(url) cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - try: - return method(proper_url, _id) - except Exception as e: - logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}") - return False + results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched + if len(results) == 0: + return False - def archive_photo(self, photo_url, photo_id): - headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} - req = requests.get("https://api.vk.com/method/photos.getById", headers) - res = req.json()["response"][0] - title = res["text"][:200] # more on the page - img_url = res["orig_photo"]["url"] - time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) - screenshot = self.get_screenshot(photo_url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) - - def archive_wall(self, wall_url, wall_id): - headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} - req = requests.get("https://api.vk.com/method/wall.getById", headers) - res = req.json()["response"] - wall = res["items"][0] - img_urls = [] - if "attachments" in wall: - for a in wall["attachments"]: - attachment = a[a["type"]] - if "thumb" in attachment: - attachment = attachment["thumb"] - if "sizes" in attachment: - try: img_urls.append(attachment["sizes"][-1]["url"]) - except Exception as e: - logger.warning(f"could not get image from attachment: {e}") - - - title = wall["text"][:200] # more on the page - time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - - page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) - screenshot = self.get_screenshot(wall_url) + dump_payload = lambda p : json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) + textual_output = "" + title, time = results[0]["text"], results[0]["datetime"] + urls_found = [] + for res in results: + textual_output+= f"id: {res['id']}
time utc: {res['datetime']}
text: {res['text']}
payload: {dump_payload(res['payload'])}


" + title = res["text"] if len(title) == 0 else title + time = res["datetime"] if not time else time + for attachments in res["attachments"].values(): + urls_found.extend(attachments) + + page_cdn, page_hash, thumbnail = self.generate_media_page(urls_found, url, textual_output) + # if multiple wall/photos/videos are present the screenshot will only grab the 1st + screenshot = self.get_screenshot(url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) diff --git a/utils/misc.py b/utils/misc.py index cd02c21..55a72f5 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -1,5 +1,6 @@ -import os, sys, requests +import os, json, requests +from datetime import datetime from loguru import logger @@ -19,10 +20,19 @@ def expand_url(url): logger.error(f'Failed to expand url {url}') return url -def getattr_or(o: object, prop: str, default = None): - try: + +def getattr_or(o: object, prop: str, default=None): + try: res = getattr(o, prop) if res is None: raise return res except: - return default \ No newline at end of file + return default + + +class DateTimeEncoder(json.JSONEncoder): + # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder) + def default(self, o): + if isinstance(o, datetime): + return str(o) # with timezone + return json.JSONEncoder.default(self, o)