feat: WACZ enricher can now be probed for media, and used as an archiver OR enricher

pull/87/head
msramalho 2023-07-27 15:42:10 +01:00
parent 65e3c99483
commit dd034da844
8 changed files with 657 additions and 579 deletions

View file

@@ -36,6 +36,7 @@ uwsgi = "*"
 requests = {extras = ["socks"], version = "*"}
 # wacz = "==0.4.8"
 numpy = "*"
+warcio = "*"
 
 [requires]
 python_version = "3.10"

Pipfile.lock: 1094 changes (generated)

Diff is too large to display.

View file

@@ -12,12 +12,13 @@ steps:
     # - tiktok_archiver
     - youtubedl_archiver
     # - wayback_archiver_enricher
+    # - wacz_archiver_enricher
   enrichers:
     - hash_enricher
     # - screenshot_enricher
     # - thumbnail_enricher
     # - wayback_archiver_enricher
-    # - wacz_enricher
+    # - wacz_archiver_enricher
     # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
   formatter: html_formatter # defaults to mute_formatter
   storages:
@@ -95,7 +96,7 @@ configurations:
     secret: "wayback secret"
   hash_enricher:
     algorithm: "SHA3-512" # can also be SHA-256
-  wacz_enricher:
+  wacz_archiver_enricher:
     profile: secrets/profile.tar.gz
   local_storage:
     save_to: "./local_archive"

View file

@@ -3,6 +3,6 @@ from .screenshot_enricher import ScreenshotEnricher
 from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
 from .thumbnail_enricher import ThumbnailEnricher
-from .wacz_enricher import WaczEnricher
+from .wacz_enricher import WaczArchiverEnricher
 from .whisper_enricher import WhisperEnricher
 from .pdq_hash_enricher import PdqHashEnricher

View file

@@ -1,16 +1,23 @@
+import mimetypes
 import os, shutil, subprocess, uuid
+from zipfile import ZipFile
 from loguru import logger
+from warcio.archiveiterator import ArchiveIterator
 
 from ..core import Media, Metadata, ArchivingContext
 from . import Enricher
+from ..archivers import Archiver
 from ..utils import UrlUtil
 
 
-class WaczEnricher(Enricher):
+class WaczArchiverEnricher(Enricher, Archiver):
     """
-    Submits the current URL to the webarchive and returns a job_id or completed archive
+    Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
+    If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
+    it can become quite powerful for archiving private content.
+    When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
     """
-    name = "wacz_enricher"
+    name = "wacz_archiver_enricher"
 
     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
@@ -20,12 +27,24 @@ class WaczEnricher(Enricher):
     def configs() -> dict:
         return {
             "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
-            "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
-            "ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
+            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
+            "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
         }
 
+    def download(self, item: Metadata) -> Metadata:
+        # this new Metadata object is required to avoid duplication
+        result = Metadata()
+        result.merge(item)
+        if self.enrich(result):
+            return result.success("wacz")
+
     def enrich(self, to_enrich: Metadata) -> bool:
+        if to_enrich.get_media_by_id("browsertrix"):
+            logger.info(f"WACZ enricher had already been executed: {to_enrich.get_media_by_id('browsertrix')}")
+            return True
+
         url = to_enrich.get_url()
+        logger.warning(f"ENRICHING WACZ for {url=}")
 
         collection = str(uuid.uuid4())[0:8]
         browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
@@ -79,8 +98,6 @@ class WaczEnricher(Enricher):
             logger.error(f"WACZ generation failed: {e}")
             return False
 
-
-
         if os.getenv('RUNNING_IN_DOCKER'):
             filename = os.path.join("collections", collection, f"{collection}.wacz")
         else:
@@ -91,3 +108,55 @@ class WaczEnricher(Enricher):
             return False
 
         to_enrich.add_media(Media(filename), "browsertrix")
+        if self.extract_media:
+            self.extract_media_from_wacz(to_enrich, filename)
         return True
+
+    def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:
+        """
+        Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
+        """
+        logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")
+
+        # unzipping the .wacz
+        tmp_dir = ArchivingContext.get_tmp_dir()
+        unzipped_dir = os.path.join(tmp_dir, "unzipped")
+        with ZipFile(wacz_filename, 'r') as z_obj:
+            z_obj.extractall(path=unzipped_dir)
+
+        # if warc is split into multiple gzip chunks, merge those
+        warc_dir = os.path.join(unzipped_dir, "archive")
+        warc_filename = os.path.join(tmp_dir, "merged.warc")
+        with open(warc_filename, 'wb') as outfile:
+            for filename in sorted(os.listdir(warc_dir)):
+                if filename.endswith('.gz'):
+                    chunk_file = os.path.join(warc_dir, filename)
+                    with open(chunk_file, 'rb') as infile:
+                        shutil.copyfileobj(infile, outfile)
+
+        # get media out of .warc
+        counter = 0
+        with open(warc_filename, 'rb') as warc_stream:
+            for record in ArchiveIterator(warc_stream):
+                # only include fetched resources
+                if record.rec_type != 'response': continue
+                record_url = record.rec_headers.get_header('WARC-Target-URI')
+                if not UrlUtil.is_relevant_url(record_url):
+                    logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
+                    continue
+
+                # filter by media mimetypes
+                content_type = record.http_headers.get("Content-Type")
+                if not content_type: continue
+                if not any(x in content_type for x in ["video", "image", "audio"]): continue
+
+                # create local file and add media
+                ext = mimetypes.guess_extension(content_type)
+                fn = os.path.join(tmp_dir, f"warc-file-{counter}{ext}")
+                with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
+                m = Media(filename=fn)
+                m.set("src", record_url)
+                # TODO: UrlUtil to ignore known-recurring media like favicons, profile pictures, etc.
+                to_enrich.add_media(m, f"browsertrix-media-{counter}")
+                counter += 1
+
+        logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")

View file

@@ -28,6 +28,7 @@ class WaybackArchiverEnricher(Enricher, Archiver):
         }
 
     def download(self, item: Metadata) -> Metadata:
+        # this new Metadata object is required to avoid duplication
         result = Metadata()
         result.merge(item)
         if self.enrich(result):
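
This download-wraps-enrich pattern is now shared by WaybackArchiverEnricher and WaczArchiverEnricher. A condensed, self-contained sketch of the idea with stand-in classes (the real Metadata/Enricher/Archiver APIs are richer; the names below are placeholders):

class Metadata:  # stand-in for the real Metadata
    def __init__(self): self.media, self.status = [], None
    def merge(self, other): self.media += other.media; return self
    def success(self, context): self.status = f"{context}: success"; return self

class DualStep:  # plays both roles, like the classes in this commit
    def download(self, item: Metadata) -> Metadata:
        # archiver entry point: work on a fresh object so media is not duplicated onto item
        result = Metadata()
        result.merge(item)
        if self.enrich(result):
            return result.success("wacz")

    def enrich(self, to_enrich: Metadata) -> bool:
        to_enrich.media.append("generated.wacz")  # placeholder for the real work
        return True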

View file

@@ -2,7 +2,6 @@
 import os, json, requests
 from datetime import datetime
 from loguru import logger
-from urllib.parse import urlparse, urlunparse
 
 
 def mkdir_if_not_exists(folder):
@@ -21,14 +20,6 @@ def expand_url(url):
         logger.error(f'Failed to expand url {url}')
     return url
 
-def remove_get_parameters(url):
-    # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
-    # useful for mimetypes to work
-    parsed_url = urlparse(url)
-    new_url = urlunparse(parsed_url._replace(query=''))
-    return new_url
-
-
 def getattr_or(o: object, prop: str, default=None):
     try:
         res = getattr(o, prop)

View file

@@ -1,14 +1,16 @@
 import re
+from urllib.parse import urlparse, urlunparse
+
 
 class UrlUtil:
     telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
     is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
 
     @staticmethod
-    def clean(url): return url
+    def clean(url: str) -> str: return url
 
     @staticmethod
-    def is_auth_wall(url):
+    def is_auth_wall(url: str) -> bool:
         """
         checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
         """
@@ -17,3 +19,28 @@ class UrlUtil:
         return False
 
+    @staticmethod
+    def remove_get_parameters(url: str) -> str:
+        # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
+        # useful for mimetypes to work
+        parsed_url = urlparse(url)
+        new_url = urlunparse(parsed_url._replace(query=''))
+        return new_url
+
+    @staticmethod
+    def is_relevant_url(url: str) -> bool:
+        """
+        Detect if a media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
+        """
+        clean_url = UrlUtil.remove_get_parameters(url)
+
+        # favicons
+        if "favicon" in url: return False
+        # ignore icons
+        if clean_url.endswith(".ico"): return False
+        # ignore SVGs
+        if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
+        # twitter profile pictures
+        if "twimg.com/profile_images" in url: return False
+
+        return True
 
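
A quick usage sketch for the two new helpers (URLs are illustrative; the auto_archiver.utils import path is assumed from the relative imports used elsewhere in this commit):

import mimetypes
from auto_archiver.utils import UrlUtil  # assumed package path

clean = UrlUtil.remove_get_parameters("http://example.com/file.mp4?t=1")
print(clean)  # http://example.com/file.mp4

# why stripping matters: mimetypes cannot see past the query string
print(mimetypes.guess_type("http://example.com/file.mp4?t=1"))  # (None, None)
print(mimetypes.guess_type(clean))                              # ('video/mp4', None)

print(UrlUtil.is_relevant_url("https://pbs.twimg.com/profile_images/1/a.jpg"))  # False
print(UrlUtil.is_relevant_url("https://example.com/clip.mp4"))                  # True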