kopia lustrzana https://github.com/dgtlmoon/changedetection.io
Remove actual :// links; don't consider these as part of the changes, since they often include variables, tracking-script references, etc.
rodzic
194ee5d528
commit
9c0c8bf6aa
|
@ -3,6 +3,10 @@ import time
|
||||||
import requests
|
import requests
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
import html2text
|
||||||
|
from urlextract import URLExtract
|
||||||
|
|
||||||
|
|
||||||
# Hmm Polymorphism datastore, thread, etc
|
# Hmm Polymorphism datastore, thread, etc
|
||||||
class perform_site_check(Thread):
|
class perform_site_check(Thread):
|
||||||
|
@ -53,17 +57,30 @@ class perform_site_check(Thread):
|
||||||
extra_headers = self.datastore.get_val(self.uuid, 'headers')
|
extra_headers = self.datastore.get_val(self.uuid, 'headers')
|
||||||
headers.update(extra_headers)
|
headers.update(extra_headers)
|
||||||
|
|
||||||
print (headers)
|
|
||||||
|
|
||||||
|
|
||||||
print("Checking", self.url)
|
print("Checking", self.url)
|
||||||
import html2text
|
|
||||||
self.ensure_output_path()
|
self.ensure_output_path()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(self.url, headers=headers, timeout=15, verify=False)
|
r = requests.get(self.url, headers=headers, timeout=15, verify=False)
|
||||||
stripped_text_from_html = html2text.html2text(r.content.decode('utf-8'))
|
stripped_text_from_html = html2text.html2text(r.content.decode('utf-8'))
|
||||||
|
|
||||||
|
# @todo This should be a config option.
|
||||||
|
# Many websites include junk in their links (trackers, etc.). Since this service is all about text changes, strip the links from the text.
|
||||||
|
|
||||||
|
extractor = URLExtract()
|
||||||
|
urls = extractor.find_urls(stripped_text_from_html)
|
||||||
|
# Remove the URLs, longest first, so that we don't end up chewing up bigger links with parts of smaller ones.
|
||||||
|
if urls:
|
||||||
|
urls.sort(key=len, reverse=True)
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
# Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
|
||||||
|
if "://" in url:
|
||||||
|
#print ("Stripping link", url)
|
||||||
|
stripped_text_from_html = stripped_text_from_html.replace(url, '')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Usually from networkIO/requests level
|
# Usually from networkIO/requests level
|
||||||
except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e:
|
except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e:
|
||||||
|
|
Ładowanie…
Reference in New Issue