kopia lustrzana https://gitlab.com/jaywink/federation
Match links with no http prefix. Remove trailing garbage from tags.
rodzic
33366802c4
commit
4b5a886492
|
@ -849,11 +849,13 @@ class Note(Object, RawContentMixin):
|
|||
|
||||
for link in self._soup.find_all('a', href=True):
|
||||
parsed = urlparse(link['href'].lower())
|
||||
# remove the query part, if any
|
||||
url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}'
|
||||
# remove the query part and trailing garbage, if any
|
||||
path = re.match(r'(/[\w/]+)', parsed.path).group()
|
||||
url = f'{parsed.scheme}://{parsed.netloc}{path}'
|
||||
links = {link['href'].lower(), url}
|
||||
if links.intersection(hrefs):
|
||||
link['data-hashtag'] = link.text.lstrip('#').lower()
|
||||
tag = re.match(r'#?([\w]+)', link.text).group(1).lower()
|
||||
link['data-hashtag'] = tag
|
||||
|
||||
def _find_and_mark_mentions(self):
|
||||
mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
|
||||
|
|
|
@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
|||
TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
|
||||
# This will also match unbalanced braces (e.g. `@{user@domain.tld` with no closing `}`). I don't think it's an issue.
|
||||
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
|
||||
URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE)
|
||||
URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE)
|
||||
|
||||
def decode_if_bytes(text):
|
||||
try:
|
||||
|
@ -52,7 +52,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
|
|||
for candidate in soup.find_all(string=True):
|
||||
if candidate.parent.name == 'code': continue
|
||||
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
|
||||
candidate.replace_with(*ns)
|
||||
if ns: candidate.replace_with(*ns)
|
||||
return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z')))
|
||||
|
||||
|
||||
|
|
Ładowanie…
Reference in New Issue