kopia lustrzana https://github.com/bellingcat/auto-archiver
				
				
				
			improves browsertrix configurations
							rodzic
							
								
									df502f3bde
								
							
						
					
					
						commit
						f0f844a569
					
				|  | @ -203,6 +203,10 @@ class Archiver(ABC): | |||
|         return self.storage.get_cdn_url(key) | ||||
| 
 | ||||
|     def get_wacz(self, url): | ||||
|         if not self.browsertrix.enabled: | ||||
|             logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.") | ||||
|             return  | ||||
| 
 | ||||
|         logger.debug(f"getting wacz for {url}") | ||||
|         key = self._get_key_from_url(url, ".wacz", append_datetime=True) | ||||
|         collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) | ||||
|  | @ -219,7 +223,8 @@ class Archiver(ABC): | |||
|             "--text", | ||||
|             "--collection", collection, | ||||
|             "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", | ||||
|             "--behaviorTimeout", str(self.browsertrix.timeout_seconds) | ||||
|             "--behaviorTimeout", str(self.browsertrix.timeout_seconds), | ||||
|             "--timeout", str(self.browsertrix.timeout_seconds) | ||||
|         ] | ||||
| 
 | ||||
|         if not os.path.isdir(browsertrix_home): | ||||
|  |  | |||
|  | @ -31,8 +31,9 @@ def update_sheet(gw, row, url, result: ArchiveResult): | |||
|     batch_if_valid('duration', result.duration, str(result.duration)) | ||||
|     batch_if_valid('screenshot', result.screenshot) | ||||
|     batch_if_valid('hash', result.hash) | ||||
|     batch_if_valid('wacz', result.wacz) | ||||
|     batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') | ||||
|     if result.wacz is not None: | ||||
|         batch_if_valid('wacz', result.wacz) | ||||
|         batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') | ||||
| 
 | ||||
|     if result.timestamp is not None: | ||||
|         if type(result.timestamp) == int: | ||||
|  |  | |||
|  | @ -2,5 +2,6 @@ from dataclasses import dataclass | |||
| 
 | ||||
| @dataclass | ||||
| class BrowsertrixConfig: | ||||
|     enabled: bool | ||||
|     profile: str | ||||
|     timeout_seconds: str | ||||
|  |  | |||
|  | @ -86,6 +86,7 @@ class Config: | |||
|         if len(browsertrix_profile := browsertrix_configs.get("profile", "")): | ||||
|             browsertrix_profile = os.path.abspath(browsertrix_profile) | ||||
|         self.browsertrix_config = BrowsertrixConfig( | ||||
|             enabled=bool(browsertrix_configs.get("enabled", False)), | ||||
|             profile=browsertrix_profile, | ||||
|             timeout_seconds=browsertrix_configs.get("timeout_seconds", "90") | ||||
|         ) | ||||
|  |  | |||
|  | @ -103,10 +103,12 @@ execution: | |||
|     window_width: 1400 | ||||
|     window_height: 2000 | ||||
| 
 | ||||
|   # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) | ||||
|   # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) | ||||
|   # browsertrix will capture a WACZ archive of the page, which can then be replayed like the original page on replayweb.page | ||||
|   browsertrix: | ||||
|     enabled: true # defaults to false | ||||
|     profile: "./browsertrix/crawls/profile.tar.gz" | ||||
|     timeout_seconds: 90 # defaults to 90s | ||||
|     timeout_seconds: 120 # defaults to 90s | ||||
|   # puts execution logs into /logs folder, defaults to false | ||||
|   save_logs: true | ||||
|   # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" | ||||
|  |  | |||
		Ładowanie…
	
		Reference in New Issue
	
	 msramalho
						msramalho