2023-12-05 22:24:12 +00:00
import jsonlines
2023-07-27 14:42:10 +00:00
import mimetypes
2023-12-12 19:12:03 +00:00
import os , shutil , subprocess
2023-07-27 14:42:10 +00:00
from zipfile import ZipFile
2023-01-19 00:27:11 +00:00
from loguru import logger
2023-07-27 14:42:10 +00:00
from warcio . archiveiterator import ArchiveIterator
2023-01-19 00:27:11 +00:00
2023-03-23 11:17:38 +00:00
from . . core import Media , Metadata , ArchivingContext
2023-01-21 19:01:02 +00:00
from . import Enricher
2023-07-27 14:42:10 +00:00
from . . archivers import Archiver
2023-12-12 19:12:03 +00:00
from . . utils import UrlUtil , random_str
2023-01-21 19:01:02 +00:00
2023-01-19 00:27:11 +00:00
2023-07-27 14:42:10 +00:00
class WaczArchiverEnricher(Enricher, Archiver):
    """
    Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
    If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
    it can become quite powerful for archiving private content.
    When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
    """
    name = "wacz_archiver_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        # configuration knobs exposed to the orchestrator; each entry is
        # {"default": ..., "help": ...} per the project's Step convention
        return {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
            "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
        }
2023-07-27 14:42:10 +00:00
def download ( self , item : Metadata ) - > Metadata :
# this new Metadata object is required to avoid duplication
result = Metadata ( )
result . merge ( item )
if self . enrich ( result ) :
return result . success ( " wacz " )
2023-01-19 00:27:11 +00:00
def enrich ( self , to_enrich : Metadata ) - > bool :
2023-07-27 14:42:10 +00:00
if to_enrich . get_media_by_id ( " browsertrix " ) :
logger . info ( f " WACZ enricher had already been executed: { to_enrich . get_media_by_id ( ' browsertrix ' ) } " )
return True
2023-01-19 00:27:11 +00:00
url = to_enrich . get_url ( )
2023-07-27 14:42:10 +00:00
2023-12-12 19:12:03 +00:00
collection = random_str ( 8 )
2023-09-14 16:49:37 +00:00
browsertrix_home_host = os . environ . get ( ' BROWSERTRIX_HOME_HOST ' ) or os . path . abspath ( ArchivingContext . get_tmp_dir ( ) )
browsertrix_home_container = os . environ . get ( ' BROWSERTRIX_HOME_CONTAINER ' ) or browsertrix_home_host
2023-07-27 14:42:10 +00:00
2023-09-14 16:49:37 +00:00
cmd = [
" crawl " ,
" --url " , url ,
" --scopeType " , " page " ,
" --generateWACZ " ,
" --text " ,
" --screenshot " , " fullPage " ,
" --collection " , collection ,
" --id " , collection ,
" --saveState " , " never " ,
" --behaviors " , " autoscroll,autoplay,autofetch,siteSpecific " ,
" --behaviorTimeout " , str ( self . timeout ) ,
" --timeout " , str ( self . timeout ) ]
2023-05-09 10:12:02 +00:00
2023-09-14 16:49:37 +00:00
# call docker if explicitly enabled or we are running on the host (not in docker)
use_docker = os . environ . get ( ' WACZ_ENABLE_DOCKER ' ) or not os . environ . get ( ' RUNNING_IN_DOCKER ' )
2023-07-27 14:42:10 +00:00
2023-09-14 16:49:37 +00:00
if use_docker :
2023-05-09 10:12:02 +00:00
logger . debug ( f " generating WACZ in Docker for { url =} " )
2023-09-14 16:49:37 +00:00
logger . debug ( f " { browsertrix_home_host =} { browsertrix_home_container =} " )
2023-09-15 18:35:35 +00:00
if self . docker_commands :
cmd = self . docker_commands + cmd
else :
cmd = [ " docker " , " run " , " --rm " , " -v " , f " { browsertrix_home_host } :/crawls/ " , " webrecorder/browsertrix-crawler " ] + cmd
2023-05-09 10:12:02 +00:00
2023-05-09 15:38:17 +00:00
if self . profile :
2023-09-14 16:49:37 +00:00
profile_fn = os . path . join ( browsertrix_home_container , " profile.tar.gz " )
2023-09-12 19:07:21 +00:00
logger . debug ( f " copying { self . profile } to { profile_fn } " )
2023-05-09 15:38:17 +00:00
shutil . copyfile ( self . profile , profile_fn )
2023-05-11 13:08:27 +00:00
cmd . extend ( [ " --profile " , os . path . join ( " /crawls " , " profile.tar.gz " ) ] )
2023-01-19 00:27:11 +00:00
2023-09-14 16:49:37 +00:00
else :
logger . debug ( f " generating WACZ without Docker for { url =} " )
if self . profile :
cmd . extend ( [ " --profile " , os . path . join ( " /app " , str ( self . profile ) ) ] )
2023-01-19 00:27:11 +00:00
try :
logger . info ( f " Running browsertrix-crawler: { ' ' . join ( cmd ) } " )
2024-02-20 18:05:29 +00:00
if self . socks_proxy_host and self . socks_proxy_port :
logger . debug ( " Using SOCKS proxy for browsertrix-crawler " )
my_env = os . environ . copy ( )
my_env [ " SOCKS_HOST " ] = self . socks_proxy_host
my_env [ " SOCKS_PORT " ] = str ( self . socks_proxy_port )
subprocess . run ( cmd , check = True , env = my_env )
2023-01-19 00:27:11 +00:00
except Exception as e :
logger . error ( f " WACZ generation failed: { e } " )
return False
2023-09-14 16:49:37 +00:00
if use_docker :
2023-09-15 18:52:47 +00:00
wacz_fn = os . path . join ( browsertrix_home_container , " collections " , collection , f " { collection } .wacz " )
2023-05-09 15:38:17 +00:00
else :
2023-09-15 18:52:47 +00:00
wacz_fn = os . path . join ( " collections " , collection , f " { collection } .wacz " )
2023-07-27 14:42:10 +00:00
2023-09-15 18:52:47 +00:00
if not os . path . exists ( wacz_fn ) :
logger . warning ( f " Unable to locate and upload WACZ { wacz_fn =} " )
2023-01-19 00:27:11 +00:00
return False
2023-09-15 18:52:47 +00:00
to_enrich . add_media ( Media ( wacz_fn ) , " browsertrix " )
2023-12-20 14:13:22 +00:00
if self . extract_media or self . extract_screenshot :
2023-09-15 18:52:47 +00:00
self . extract_media_from_wacz ( to_enrich , wacz_fn )
2023-12-05 22:24:12 +00:00
if use_docker :
jsonl_fn = os . path . join ( browsertrix_home_container , " collections " , collection , " pages " , " pages.jsonl " )
else :
jsonl_fn = os . path . join ( " collections " , collection , " pages " , " pages.jsonl " )
if not os . path . exists ( jsonl_fn ) :
logger . warning ( f " Unable to locate and pages.jsonl { jsonl_fn =} " )
else :
logger . info ( f " Parsing pages.jsonl { jsonl_fn =} " )
with jsonlines . open ( jsonl_fn ) as reader :
for obj in reader :
if ' title ' in obj :
to_enrich . set_title ( obj [ ' title ' ] )
if ' text ' in obj :
to_enrich . set_content ( obj [ ' text ' ] )
2023-07-27 14:42:10 +00:00
return True
def extract_media_from_wacz ( self , to_enrich : Metadata , wacz_filename : str ) - > None :
"""
Receives a . wacz archive , and extracts all relevant media from it , adding them to to_enrich .
"""
logger . info ( f " WACZ extract_media flag is set, extracting media from { wacz_filename =} " )
# unzipping the .wacz
tmp_dir = ArchivingContext . get_tmp_dir ( )
unzipped_dir = os . path . join ( tmp_dir , " unzipped " )
with ZipFile ( wacz_filename , ' r ' ) as z_obj :
z_obj . extractall ( path = unzipped_dir )
# if warc is split into multiple gzip chunks, merge those
warc_dir = os . path . join ( unzipped_dir , " archive " )
warc_filename = os . path . join ( tmp_dir , " merged.warc " )
with open ( warc_filename , ' wb ' ) as outfile :
for filename in sorted ( os . listdir ( warc_dir ) ) :
if filename . endswith ( ' .gz ' ) :
chunk_file = os . path . join ( warc_dir , filename )
with open ( chunk_file , ' rb ' ) as infile :
shutil . copyfileobj ( infile , outfile )
# get media out of .warc
counter = 0
2023-07-27 20:36:25 +00:00
seen_urls = set ( )
2023-07-27 14:42:10 +00:00
with open ( warc_filename , ' rb ' ) as warc_stream :
for record in ArchiveIterator ( warc_stream ) :
# only include fetched resources
2023-12-20 14:13:22 +00:00
if record . rec_type == " resource " and self . extract_screenshot : # screenshots
2023-07-27 20:36:25 +00:00
fn = os . path . join ( tmp_dir , f " warc-file- { counter } .png " )
with open ( fn , " wb " ) as outf : outf . write ( record . raw_stream . read ( ) )
m = Media ( filename = fn )
to_enrich . add_media ( m , " browsertrix-screenshot " )
counter + = 1
2023-12-20 14:13:22 +00:00
if not self . extract_media : continue
2023-07-27 20:36:25 +00:00
2023-07-27 14:42:10 +00:00
if record . rec_type != ' response ' : continue
record_url = record . rec_headers . get_header ( ' WARC-Target-URI ' )
if not UrlUtil . is_relevant_url ( record_url ) :
logger . debug ( f " Skipping irrelevant URL { record_url } but it ' s still present in the WACZ. " )
continue
2023-07-27 20:36:25 +00:00
if record_url in seen_urls :
logger . debug ( f " Skipping already seen URL { record_url } . " )
continue
2023-07-27 14:42:10 +00:00
# filter by media mimetypes
content_type = record . http_headers . get ( " Content-Type " )
if not content_type : continue
if not any ( x in content_type for x in [ " video " , " image " , " audio " ] ) : continue
# create local file and add media
ext = mimetypes . guess_extension ( content_type )
2023-07-27 20:36:25 +00:00
warc_fn = f " warc-file- { counter } { ext } "
fn = os . path . join ( tmp_dir , warc_fn )
record_url_best_qual = UrlUtil . twitter_best_quality_url ( record_url )
2023-07-27 14:42:10 +00:00
with open ( fn , " wb " ) as outf : outf . write ( record . raw_stream . read ( ) )
2023-07-27 20:36:25 +00:00
2023-07-27 14:42:10 +00:00
m = Media ( filename = fn )
m . set ( " src " , record_url )
2023-07-27 20:36:25 +00:00
# if a link with better quality exists, try to download that
if record_url_best_qual != record_url :
try :
2024-02-20 18:05:29 +00:00
m . filename = self . download_from_url ( record_url_best_qual , warc_fn )
2023-07-27 20:36:25 +00:00
m . set ( " src " , record_url_best_qual )
m . set ( " src_alternative " , record_url )
except Exception as e : logger . warning ( f " Unable to download best quality URL for { record_url =} got error { e } , using original in WARC. " )
2023-07-28 11:19:14 +00:00
# remove bad videos
if m . is_video ( ) and not m . is_valid_video ( ) : continue
2023-07-27 20:36:25 +00:00
to_enrich . add_media ( m , warc_fn )
2023-07-27 14:42:10 +00:00
counter + = 1
2023-07-27 20:36:25 +00:00
seen_urls . add ( record_url )
2023-07-27 14:42:10 +00:00
logger . info ( f " WACZ extract_media finished, found { counter } relevant media file(s) " )