Remove actual :// links; don't consider these as part of the changes, since they often include variables, tracking-script references, etc.

2021-01-28 14:45:01 +01:00 · 2021-01-28 14:45:01 +01:00 · 9c0c8bf6aa
commit 9c0c8bf6aa
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -3,6 +3,10 @@ import time
 import requests
 import hashlib
 import os
 import re
 import html2text
 from urlextract import URLExtract
 # Hmm Polymorphism datastore, thread, etc
 class perform_site_check(Thread):
@@ -53,17 +57,30 @@ class perform_site_check(Thread):
        extra_headers = self.datastore.get_val(self.uuid, 'headers')
        headers.update(extra_headers)
        print (headers)
        print("Checking", self.url)
-        import html2text
+
        self.ensure_output_path()
        try:
            r = requests.get(self.url, headers=headers, timeout=15, verify=False)
            stripped_text_from_html = html2text.html2text(r.content.decode('utf-8'))
            # @todo This should be a config option.
            # Many websites include junk in their links (trackers, etc.). Since this service is really all about text changes, the links are stripped out below.
            extractor = URLExtract()
            urls = extractor.find_urls(stripped_text_from_html)
            # Remove the URLs, longest first, so that we don't end up chewing up bigger links with parts of smaller ones.
            if urls:
                urls.sort(key=len, reverse=True)
                for url in urls:
                    # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
                    if "://" in url:
                        #print ("Stripping link", url)
                        stripped_text_from_html = stripped_text_from_html.replace(url, '')
        # Usually from networkIO/requests level
        except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: