kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Remove actual :// links; don't consider these as part of the changes — they often include variables, tracking-script refs, etc.
rodzic
194ee5d528
commit
9c0c8bf6aa
|
@ -3,6 +3,10 @@ import time
|
|||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import html2text
|
||||
from urlextract import URLExtract
|
||||
|
||||
|
||||
# Hmm Polymorphism datastore, thread, etc
|
||||
class perform_site_check(Thread):
|
||||
|
@ -53,17 +57,30 @@ class perform_site_check(Thread):
|
|||
extra_headers = self.datastore.get_val(self.uuid, 'headers')
|
||||
headers.update(extra_headers)
|
||||
|
||||
print (headers)
|
||||
|
||||
|
||||
print("Checking", self.url)
|
||||
import html2text
|
||||
|
||||
self.ensure_output_path()
|
||||
|
||||
try:
|
||||
r = requests.get(self.url, headers=headers, timeout=15, verify=False)
|
||||
stripped_text_from_html = html2text.html2text(r.content.decode('utf-8'))
|
||||
|
||||
# @todo This should be a config option.
|
||||
# Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
|
||||
|
||||
extractor = URLExtract()
|
||||
urls = extractor.find_urls(stripped_text_from_html)
|
||||
# Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
|
||||
if urls:
|
||||
urls.sort(key=len, reverse=True)
|
||||
|
||||
for url in urls:
|
||||
# Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
|
||||
if "://" in url:
|
||||
#print ("Stripping link", url)
|
||||
stripped_text_from_html = stripped_text_from_html.replace(url, '')
|
||||
|
||||
|
||||
|
||||
# Usually from networkIO/requests level
|
||||
except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e:
|
||||
|
|
Ładowanie…
Reference in New Issue