diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 18e4c1b..eb508c0 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,4 +1,4 @@ -import os, datetime, shutil, hashlib, time, requests, re +import os, datetime, shutil, hashlib, time, requests, re, mimetypes from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -58,7 +58,13 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" page += f"" @@ -77,8 +83,20 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) + def _guess_file_type(self, path:str): + """ + Receives a URL or filename and returns a tuple of the top-level MIME type (like 'image' or 'video') and the full MIME type, both derived from mimetypes.guess_type(path) + see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types + ex: ('audio', 'audio/mpeg'); returns ('', '') when no MIME type can be guessed + """ + mime = mimetypes.guess_type(path)[0] + if mime is not None: + return mime.split("/")[0], mime + return "", "" + + # eg images in a tweet save to cloud storage - def generate_media_page(self, urls, url, object): + def generate_media_page(self, urls, url, object, requester=requests): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them @@ -94,7 +112,7 @@ class Archiver(ABC): filename = os.path.join(Storage.TMP_FOLDER, key) - d = requests.get(media_url, headers=headers) + d = requester.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content)