2017-09-12 14:31:50 +00:00
|
|
|
# coding=utf-8
|
2017-08-15 06:07:24 +00:00
|
|
|
"""Misc common utilities.
|
|
|
|
"""
|
2023-05-27 21:57:29 +00:00
|
|
|
import base64
|
2023-03-08 21:10:41 +00:00
|
|
|
from datetime import timedelta
|
2017-08-15 06:07:24 +00:00
|
|
|
import logging
|
2017-09-13 14:48:32 +00:00
|
|
|
import re
|
2023-03-11 06:24:58 +00:00
|
|
|
import threading
|
2019-12-26 06:20:57 +00:00
|
|
|
import urllib.parse
|
2017-08-15 06:07:24 +00:00
|
|
|
|
2023-03-11 06:24:58 +00:00
|
|
|
import cachetools
|
2023-05-27 21:57:29 +00:00
|
|
|
from Crypto.Util import number
|
2023-03-20 21:28:14 +00:00
|
|
|
from flask import abort, g, make_response, request
|
2023-03-11 06:24:58 +00:00
|
|
|
from oauth_dropins.webutil import util, webmention
|
2023-01-08 20:01:31 +00:00
|
|
|
from oauth_dropins.webutil.appengine_info import DEBUG
|
2023-06-10 14:53:07 +00:00
|
|
|
from werkzeug.exceptions import BadRequest
|
2017-10-16 14:13:43 +00:00
|
|
|
|
2022-02-12 06:38:56 +00:00
|
|
|
logger = logging.getLogger(__name__)

# Loose pattern for strings that look like a domain name. Allows hostname chars
# (a-z, 0-9, -) and arbitrary unicode (eg ☃.net), but rejects specific chars
# that we'll often see in webfinger addresses, AP handles, etc. (eg @, :).
# https://stackoverflow.com/questions/10306690/what-is-a-regular-expression-which-will-match-a-valid-domain-name-without-a-subd
#
# TODO: preprocess with domain2idna, then narrow this to just [a-z0-9-]
DOMAIN_RE = r'^[^/:;@?!\']+\.[^/:@_?!\']+$'

# NOTE(review): these look like file extensions, presumably blocked so that
# file names like foo.html aren't mistaken for domains - confirm against callers.
TLD_BLOCKLIST = (
    '7z',
    'asp',
    'aspx',
    'gif',
    'html',
    'ico',
    'jpg',
    'jpeg',
    'js',
    'json',
    'php',
    'png',
    'rar',
    'txt',
    'yaml',
    'yml',
    'zip',
)

CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
|
2017-10-20 14:49:25 +00:00
|
|
|
|
2020-12-30 18:26:48 +00:00
|
|
|
PRIMARY_DOMAIN = 'fed.brid.gy'

# protocol-specific subdomains are under this "super"domain
SUPERDOMAIN = '.brid.gy'

# TODO: add a Flask route decorator version of util.canonicalize_domain, then
# use it to canonicalize most UI routes from these to fed.brid.gy.
OTHER_DOMAINS = (
    'ap.brid.gy',
    'atp.brid.gy',
    'bluesky.brid.gy',
    'bsky.brid.gy',
    'bridgy-federated.appspot.com',
    'bridgy-federated.uc.r.appspot.com',
    'nostr.brid.gy',
    'web.brid.gy',
)

LOCAL_DOMAINS = (
    'localhost',
    'localhost:8080',
    'my.dev.com:8080',
)

# every domain we serve on: primary first, then the others, then local dev
DOMAINS = (PRIMARY_DOMAIN, *OTHER_DOMAINS, *LOCAL_DOMAINS)

# TODO: unify with Bridgy's
DOMAIN_BLOCKLIST = frozenset((
    # https://github.com/snarfed/bridgy-fed/issues/348
    'aaronparecki.com',
    'facebook.com',
    'fb.com',
    't.co',
    'twitter.com',
    # our own domains are blocklisted too
    *DOMAINS,
))

CACHE_TIME = timedelta(seconds=60)
|
2023-02-15 18:57:11 +00:00
|
|
|
|
2017-08-15 06:07:24 +00:00
|
|
|
|
2023-06-10 14:53:07 +00:00
|
|
|
class NoMicroformats(BadRequest):
    """Raised by :meth:`Web.fetch` when a page has no microformats2.

    Subclasses :class:`werkzeug.exceptions.BadRequest` without adding any
    behavior of its own; it only gives callers a distinct type to catch.
    """
|
|
|
|
|
|
|
|
|
2023-05-27 21:57:29 +00:00
|
|
|
def base64_to_long(x):
    """Decodes a URL-safe base64 value into an integer.

    The decoded bytes are read as a single unsigned, big-endian number.

    Originally from django_salmon.magicsigs. Used in :meth:`User.public_pem`
    and :meth:`User.private_pem`.

    Args:
      x: str or bytes, URL-safe base64 encoded

    Returns: int
    """
    raw = base64.urlsafe_b64decode(x)
    return int.from_bytes(raw, byteorder='big', signed=False)
|
|
|
|
|
|
|
|
|
|
|
|
def long_to_base64(x):
    """Encodes an integer as URL-safe base64.

    The integer is first serialized with :func:`Crypto.Util.number.long_to_bytes`,
    then those bytes are base64 encoded with the URL-safe alphabet.

    Originally from django_salmon.magicsigs. Used in :meth:`User.get_or_create`.

    Args:
      x: int

    Returns: bytes, URL-safe base64 encoded
    """
    as_bytes = number.long_to_bytes(x)
    return base64.urlsafe_b64encode(as_bytes)
|
|
|
|
|
|
|
|
|
2023-01-05 23:03:21 +00:00
|
|
|
def host_url(path_query=None):
    """Returns a URL on the domain we should serve this request from.

    Starts from the current request's host URL. If the request's host matches
    one of :const:`OTHER_DOMAINS` (as decided by ``util.domain_or_parent_in``),
    or is one of :const:`LOCAL_DOMAINS` while not in ``DEBUG`` mode, the base is
    ``https://`` plus :const:`PRIMARY_DOMAIN` instead.

    Args:
      path_query: str, optional path and/or query to join onto the base URL

    Returns: str, URL
    """
    canonical = f'https://{PRIMARY_DOMAIN}'

    base = request.host_url
    host = request.host
    if util.domain_or_parent_in(host, OTHER_DOMAINS):
        base = canonical
    elif not DEBUG and host in LOCAL_DOMAINS:
        # when running locally against prod datastore
        base = canonical

    return urllib.parse.urljoin(base, path_query)
|
2023-01-05 23:03:21 +00:00
|
|
|
|
|
|
|
|
2023-04-04 14:14:31 +00:00
|
|
|
def error(msg, status=400, exc_info=None, **kwargs):
    """Logs an error, then aborts the request with a JSON error response.

    Like flask_util.error, but wraps body in JSON: the response body is
    ``{'error': msg}``.

    Args:
      msg: str, error message
      status: int, HTTP status code for the response
      exc_info: passed through to the logging call
      kwargs: passed through to :func:`flask.abort`
    """
    logger.info(f'Returning {status}: {msg}', exc_info=exc_info)

    body = {'error': msg}
    response = make_response(body, status)
    abort(status, response=response, **kwargs)
|
2017-08-15 06:07:24 +00:00
|
|
|
|
|
|
|
|
2023-03-20 21:28:14 +00:00
|
|
|
def pretty_link(url, text=None, **kwargs):
    """Wrapper around util.pretty_link() that converts Mastodon user URLs to @-@.

    If the current user (``g.user``) is set and reports ``url`` as one of its
    web URLs, returns that user's page link instead.

    Otherwise, for URLs like https://mastodon.social/@foo and
    https://mastodon.social/users/foo, text defaults to @foo@mastodon.social
    when it's not provided.

    Args:
      url: str
      text: str, optional link text
      kwargs: passed through to :func:`webutil.util.pretty_link`

    Returns: whatever ``g.user.user_page_link()`` or
      :func:`webutil.util.pretty_link` returns
    """
    user = g.user
    if user and user.is_web_url(url):
        return user.user_page_link()

    if text is None:
        # groups: 1 is the domain, 3 is the username
        profile = re.match(r'https?://([^/]+)/(@|users/)([^/]+)$', url)
        if profile:
            text = profile.expand(r'@\3@\1')

    return util.pretty_link(url, text=text, **kwargs)
|
2023-02-07 05:08:52 +00:00
|
|
|
|
|
|
|
|
2017-10-21 03:35:07 +00:00
|
|
|
def content_type(resp):
    """Returns a :class:`requests.Response`'s Content-Type, without charset suffix.

    Only the part before the first ``;`` is returned, with surrounding
    whitespace removed, so that eg ``text/html ; charset=utf-8`` yields
    ``text/html`` and not ``text/html `` with a trailing space.

    Args:
      resp: object with a ``headers`` mapping, eg :class:`requests.Response`

    Returns: str, or None if the Content-Type header is missing or empty
    """
    header = resp.headers.get('Content-Type')
    if not header:
        return None

    # drop parameters like charset; whitespace before the ; is legal but would
    # otherwise break equality checks against bare media types
    return header.split(';')[0].strip()
|
|
|
|
|
|
|
|
|
2023-07-02 21:55:05 +00:00
|
|
|
def is_blocklisted(url):
    """Returns True if the given URL is in our domain blocklist, False otherwise.

    Extracts the URL's domain with ``util.domain_from_link``, then checks it
    against :const:`DOMAIN_BLOCKLIST` with ``util.domain_or_parent_in``.

    Args:
      url: str

    Returns: boolean
    """
    domain = util.domain_from_link(url)
    return util.domain_or_parent_in(domain, DOMAIN_BLOCKLIST)
|
2022-11-14 15:07:33 +00:00
|
|
|
|
|
|
|
|
2021-07-08 04:02:13 +00:00
|
|
|
def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    Empty values, and URLs that already start with our host URL, are returned
    unchanged. Everything else is prefixed with our ``/r/`` URL.

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Args:
      url: string

    Returns: string, redirect url
    """
    if not url:
        return url

    if url.startswith(host_url()):
        # already on our domain, nothing to wrap
        return url

    return host_url('/r/') + url
|
2020-01-31 15:38:58 +00:00
|
|
|
|
|
|
|
|
2021-07-08 04:02:13 +00:00
|
|
|
def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string, dict, or list. dicts and lists are unwrapped
    recursively: dict values and list elements are each unwrapped, dict keys
    are left alone.

    Two string forms are unwrapped:

    * our ``/r/`` URL followed by a web URL, which becomes that web URL
    * our host URL followed by something matching :const:`DOMAIN_RE`, which
      becomes ``https://[domain]/``

    Strings that aren't wrapped URLs, and values of any other type, are left
    unchanged.

    Args:
      val: string or dict or list

    Returns: same type as val, with wrapped URLs unwrapped
    """
    if isinstance(val, dict):
        return {key: redirect_unwrap(item) for key, item in val.items()}

    if isinstance(val, list):
        return [redirect_unwrap(item) for item in val]

    if not isinstance(val, str):
        return val

    redirect_prefix = host_url('/r/')
    if val.startswith(redirect_prefix):
        target = val.removeprefix(redirect_prefix)
        # only unwrap if what's left is itself a web URL
        return target if util.is_web(target) else val

    our_root = host_url()
    if val.startswith(our_root):
        remainder = val.removeprefix(our_root)
        if re.match(DOMAIN_RE, remainder):
            return f'https://{remainder}/'

    return val
|
2022-11-19 02:49:34 +00:00
|
|
|
|
|
|
|
|
2023-03-11 20:14:48 +00:00
|
|
|
def webmention_endpoint_cache_key(url):
    """Returns cache key for a cached webmention endpoint for a given URL.

    The key is the URL's network location. If the URL is the home page, ie its
    path is empty or / , the key gets ' /' appended, so that we cache
    webmention endpoints for home pages separate from other pages.
    https://github.com/snarfed/bridgy/issues/701

    Example: 'snarfed.org /'

    https://github.com/snarfed/bridgy-fed/issues/423

    Adapted from bridgy/util.py.

    Args:
      url: str

    Returns: str
    """
    parts = urllib.parse.urlparse(url)
    is_home_page = parts.path in ('', '/')
    return parts.netloc + ' /' if is_home_page else parts.netloc
|
2023-03-11 20:14:48 +00:00
|
|
|
|
|
|
|
|
2023-03-11 06:24:58 +00:00
|
|
|
@cachetools.cached(
    cachetools.TTLCache(50000, 60 * 60 * 2),  # 2h expiration
    # cachetools calls the key function with the same positional and keyword
    # arguments as the wrapped function. The cache key only depends on the URL,
    # so accept and ignore kwargs here instead of raising TypeError on them.
    key=lambda url, **kwargs: webmention_endpoint_cache_key(url),
    lock=threading.Lock(),
    info=True)
def webmention_discover(url, **kwargs):
    """Thin caching wrapper around :func:`webmention.discover`.

    Results are cached for two hours, keyed by
    :func:`webmention_endpoint_cache_key`. kwargs are not part of the cache
    key, so a cached result is reused regardless of the kwargs passed.

    Args:
      url: str
      kwargs: passed through to :func:`webmention.discover`

    Returns: whatever :func:`webmention.discover` returns
    """
    return webmention.discover(url, **kwargs)
|
2023-06-30 05:15:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
def add(seq, val):
    """Appends val to seq, in place, unless seq already contains it.

    Useful for treating repeated ndb properties like sets instead of lists.

    Args:
      seq: mutable sequence with an ``append`` method, eg list
      val: value to add
    """
    if val in seq:
        return

    seq.append(val)
|