kopia lustrzana https://github.com/bellingcat/auto-archiver
				
				
				
			improves browsertrix configurations
							rodzic
							
								
									df502f3bde
								
							
						
					
					
						commit
						f0f844a569
					
				| 
						 | 
				
			
			@ -203,6 +203,10 @@ class Archiver(ABC):
 | 
			
		|||
        return self.storage.get_cdn_url(key)
 | 
			
		||||
 | 
			
		||||
    def get_wacz(self, url):
 | 
			
		||||
        if not self.browsertrix.enabled:
 | 
			
		||||
            logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
 | 
			
		||||
            return 
 | 
			
		||||
 | 
			
		||||
        logger.debug(f"getting wacz for {url}")
 | 
			
		||||
        key = self._get_key_from_url(url, ".wacz", append_datetime=True)
 | 
			
		||||
        collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
 | 
			
		||||
| 
						 | 
				
			
			@ -219,7 +223,8 @@ class Archiver(ABC):
 | 
			
		|||
            "--text",
 | 
			
		||||
            "--collection", collection,
 | 
			
		||||
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
 | 
			
		||||
            "--behaviorTimeout", str(self.browsertrix.timeout_seconds)
 | 
			
		||||
            "--behaviorTimeout", str(self.browsertrix.timeout_seconds),
 | 
			
		||||
            "--timeout", str(self.browsertrix.timeout_seconds)
 | 
			
		||||
        ]
 | 
			
		||||
 | 
			
		||||
        if not os.path.isdir(browsertrix_home):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -31,6 +31,7 @@ def update_sheet(gw, row, url, result: ArchiveResult):
 | 
			
		|||
    batch_if_valid('duration', result.duration, str(result.duration))
 | 
			
		||||
    batch_if_valid('screenshot', result.screenshot)
 | 
			
		||||
    batch_if_valid('hash', result.hash)
 | 
			
		||||
    if result.wacz is not None:
 | 
			
		||||
        batch_if_valid('wacz', result.wacz)
 | 
			
		||||
        batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2,5 +2,6 @@ from dataclasses import dataclass
 | 
			
		|||
 | 
			
		||||
@dataclass
 | 
			
		||||
class BrowsertrixConfig:
 | 
			
		||||
    enabled: bool
 | 
			
		||||
    profile: str
 | 
			
		||||
    timeout_seconds: str
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -86,6 +86,7 @@ class Config:
 | 
			
		|||
        if len(browsertrix_profile := browsertrix_configs.get("profile", "")):
 | 
			
		||||
            browsertrix_profile = os.path.abspath(browsertrix_profile)
 | 
			
		||||
        self.browsertrix_config = BrowsertrixConfig(
 | 
			
		||||
            enabled=bool(browsertrix_configs.get("enabled", False)),
 | 
			
		||||
            profile=browsertrix_profile,
 | 
			
		||||
            timeout_seconds=browsertrix_configs.get("timeout_seconds", "90")
 | 
			
		||||
        )
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -103,10 +103,12 @@ execution:
 | 
			
		|||
    window_width: 1400
 | 
			
		||||
    window_height: 2000
 | 
			
		||||
 | 
			
		||||
  # optional browsertrix profile file (see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
 | 
			
		||||
  # optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
 | 
			
		||||
  # browsertrix captures a WACZ archive of the page, which can then be replayed like the original page on https://replayweb.page
 | 
			
		||||
  browsertrix:
 | 
			
		||||
    enabled: true # defaults to false
 | 
			
		||||
    profile: "./browsertrix/crawls/profile.tar.gz"
 | 
			
		||||
    timeout_seconds: 90 # defaults to 90s
 | 
			
		||||
    timeout_seconds: 120 # defaults to 90s
 | 
			
		||||
  # puts execution logs into /logs folder, defaults to false
 | 
			
		||||
  save_logs: true
 | 
			
		||||
  # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Ładowanie…
	
		Reference in New Issue