kopia lustrzana https://gitlab.com/jaywink/federation
Improve URL_PATTERN.
rodzic
5c168d6630
commit
5dac605c4b
|
@ -10,7 +10,10 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
||||||
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
|
# Hashtag matcher. Group 1 is "#" followed by one or more word characters or
# hyphens; group 2 is the terminator that must follow it: a run of closing
# punctuation/whitespace (plus any further whitespace) or the end of the string.
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
|
||||||
# This will match non matching braces. I don't think it's an issue.
|
# This will also match unbalanced braces (an opening "{" without a closing "}", or vice versa). I don't think it's an issue.
|
||||||
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
|
# Mention matcher (single capturing group): "@", an optional "{", an optional
# free-text prefix terminated by ";" (presumably a display name - confirm against
# callers), then a "user@host.tld" handle and an optional "}". The two braces are
# optional independently of each other.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
|
||||||
URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE)
|
# based on https://stackoverflow.com/a/6041965
|
||||||
|
# URL matcher with a single capturing group (group 1 is the whole URL).
# It accepts either an explicit scheme ("http://", "https://", "ftp://") or a bare
# dotted name, optionally followed by a tail. The tail may contain balanced
# parentheses nested up to two levels deep, and must end either with such a
# parenthesised group or with a character that is not whitespace or trailing
# punctuation, so a sentence-final "." or "," is left out of the match.
# NOTE: both copies of the nested-parenthesis sub-pattern must read
# `(?:\([^\s()<>]+\))`; writing `\(?:` there would demand a literal ":" instead.
URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|\b(?:\w+\.)+\w+)(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))?)',
                         re.UNICODE)
|
||||||
|
|
||||||
|
|
||||||
def decode_if_bytes(text):
|
def decode_if_bytes(text):
|
||||||
try:
|
try:
|
||||||
|
@ -52,7 +55,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
|
||||||
final = []
|
final = []
|
||||||
for candidate in soup.find_all(string=True):
|
for candidate in soup.find_all(string=True):
|
||||||
if candidate.parent.name == 'code': continue
|
if candidate.parent.name == 'code': continue
|
||||||
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
|
ns = [NavigableString(r) for r in pattern.split(candidate.text) if r]
|
||||||
found = [s for s in ns if pattern.match(s.text)]
|
found = [s for s in ns if pattern.match(s.text)]
|
||||||
if found:
|
if found:
|
||||||
candidate.replace_with(*ns)
|
candidate.replace_with(*ns)
|
||||||
|
|
Ładowanie…
Reference in New Issue