bridgy-fed/common.py

# coding=utf-8
"""Misc common utilities.
"""
import copy
from datetime import timedelta
import logging
import re
import threading
import urllib.parse

import cachetools
from flask import abort, g, make_response, request
from granary import as1, as2, microformats2
import mf2util
from oauth_dropins.webutil import util, webmention
from oauth_dropins.webutil.appengine_info import DEBUG
from oauth_dropins.webutil.util import json_dumps, json_loads

logger = logging.getLogger(__name__)

DOMAIN_RE = r'[^/:]+\.[^/:]+'
TLD_BLOCKLIST = ('7z', 'asp', 'aspx', 'gif', 'html', 'ico', 'jpg', 'jpeg', 'js',
                 'json', 'php', 'png', 'rar', 'txt', 'yaml', 'yml', 'zip')
CONTENT_TYPE_HTML = 'text/html; charset=utf-8'
PRIMARY_DOMAIN = 'fed.brid.gy'
OTHER_DOMAINS = (
    'bridgy-federated.appspot.com',
    'bridgy-federated.uc.r.appspot.com',
)
LOCAL_DOMAINS = (
    'localhost',
    'localhost:8080',
    'my.dev.com:8080',
)
DOMAINS = (PRIMARY_DOMAIN,) + OTHER_DOMAINS + LOCAL_DOMAINS

# TODO: unify with Bridgy's
DOMAIN_BLOCKLIST = frozenset((
    # https://github.com/snarfed/bridgy-fed/issues/348
    'aaronparecki.com',
    'facebook.com',
    'fb.com',
    't.co',
    'twitter.com',
) + DOMAINS)
CACHE_TIME = timedelta(seconds=60)


def host_url(path_query=None):
    base = request.host_url
    if (util.domain_or_parent_in(request.host, OTHER_DOMAINS) or
            # when running locally against prod datastore
            (not DEBUG and request.host in LOCAL_DOMAINS)):
        base = f'https://{PRIMARY_DOMAIN}'

    return urllib.parse.urljoin(base, path_query)
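
# Illustrative usage sketch, not part of the original file. Inside a Flask
# request context whose host is the canonical fed.brid.gy domain (`app` here is
# an assumed Flask app, not defined in this module):
#
#   with app.test_request_context(base_url='https://fed.brid.gy'):
#       host_url('/r/')  # -> 'https://fed.brid.gy/r/'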


def error(msg, status=400, exc_info=None, **kwargs):
    """Like flask_util.error, but wraps body in JSON."""
    logger.info(f'Returning {status}: {msg}', exc_info=exc_info)
    abort(status, response=make_response({'error': msg}, status), **kwargs)


def pretty_link(url, text=None, **kwargs):
    """Wrapper around util.pretty_link() that converts Mastodon user URLs to @-@.

    Eg for URLs like https://mastodon.social/@foo and
    https://mastodon.social/users/foo, defaults text to @foo@mastodon.social if
    it's not provided.

    Args:
      url: str
      text: str
      kwargs: passed through to :func:`webutil.util.pretty_link`
    """
    if g.user and g.user.is_homepage(url):
        return g.user.user_page_link()

    if text is None:
        match = re.match(r'https?://([^/]+)/(@|users/)([^/]+)$', url)
        if match:
            text = match.expand(r'@\3@\1')

    return util.pretty_link(url, text=text, **kwargs)
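
# Illustrative sketch, not part of the original file: inside a request where
# g.user is unset, a Mastodon profile URL gets @user@instance link text by
# default, e.g.
#
#   pretty_link('https://mastodon.social/@foo')
#   # -> <a> tag whose text is '@foo@mastodon.social'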


def content_type(resp):
    """Returns a :class:`requests.Response`'s Content-Type, without charset suffix."""
    type = resp.headers.get('Content-Type')
    if type:
        return type.split(';')[0]


def remove_blocklisted(urls):
    """Returns the subset of input URLs that aren't in our domain blocklist.

    Args:
      urls: sequence of str

    Returns: list of str
    """
    return [u for u in urls if not util.domain_or_parent_in(
        util.domain_from_link(u), DOMAIN_BLOCKLIST)]
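
# Illustrative sketch, not part of the original file:
#
#   remove_blocklisted(['https://example.com/post', 'https://twitter.com/foo'])
#   # -> ['https://example.com/post']   (twitter.com is in DOMAIN_BLOCKLIST)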


def redirect_wrap(url):
    """Returns a URL on our domain that redirects to this URL.

    ...to satisfy Mastodon's non-standard domain matching requirement. :(

    Args:
      url: string

    https://github.com/snarfed/bridgy-fed/issues/16#issuecomment-424799599
    https://github.com/tootsuite/mastodon/pull/6219#issuecomment-429142747

    Returns: string, redirect url
    """
    if not url or url.startswith(host_url()):
        return url

    return host_url('/r/') + url
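
# Illustrative sketch, not part of the original file, assuming a request on the
# canonical fed.brid.gy host:
#
#   redirect_wrap('https://user.example/post')
#   # -> 'https://fed.brid.gy/r/https://user.example/post'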


def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string, dict, or list. dicts and lists are unwrapped
    recursively. Strings that aren't wrapped URLs are left unchanged.

    Args:
      val: string or dict or list

    Returns: string, unwrapped url
    """
    if isinstance(val, dict):
        return {k: redirect_unwrap(v) for k, v in val.items()}

    elif isinstance(val, list):
        return [redirect_unwrap(v) for v in val]

    elif isinstance(val, str):
        prefix = host_url('/r/')
        if val.startswith(prefix):
            unwrapped = val.removeprefix(prefix)
            if util.is_web(unwrapped):
                return unwrapped
        elif val.startswith(host_url()):
            path = val.removeprefix(host_url())
            if re.match(DOMAIN_RE, path):
                return f'https://{path}/'

    return val
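
# Illustrative sketch, not part of the original file, assuming a request on the
# canonical fed.brid.gy host:
#
#   redirect_unwrap('https://fed.brid.gy/r/https://user.example/post')
#   # -> 'https://user.example/post'
#   redirect_unwrap({'id': 'https://fed.brid.gy/user.example'})
#   # -> {'id': 'https://user.example/'}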


def webmention_endpoint_cache_key(url):
    """Returns the cache key for a cached webmention endpoint for a given URL.

    Just the domain by default. If the URL is the home page, ie the path is /,
    the key includes a / at the end, so that we cache webmention endpoints for
    home pages separately from other pages.
    https://github.com/snarfed/bridgy/issues/701

    Example: 'snarfed.org /'

    https://github.com/snarfed/bridgy-fed/issues/423

    Adapted from bridgy/util.py.
    """
    parsed = urllib.parse.urlparse(url)
    key = parsed.netloc
    if parsed.path in ('', '/'):
        key += ' /'

    # logger.debug(f'wm cache key {key}')
    return key
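
# Illustrative sketch, not part of the original file:
#
#   webmention_endpoint_cache_key('https://snarfed.org/')           # -> 'snarfed.org /'
#   webmention_endpoint_cache_key('https://snarfed.org/2023/post')  # -> 'snarfed.org'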


@cachetools.cached(cachetools.TTLCache(50000, 60 * 60 * 2),  # 2h expiration
                   key=webmention_endpoint_cache_key,
                   lock=threading.Lock(),
                   info=True)
def webmention_discover(url, **kwargs):
    """Thin caching wrapper around :func:`webmention.discover`."""
    return webmention.discover(url, **kwargs)
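
# Illustrative sketch, not part of the original file: because the cache key is
# per-domain, repeated discovery for pages on the same site reuses the cached
# endpoint for up to two hours, e.g.
#
#   webmention_discover('https://user.example/post-1')
#   webmention_discover('https://user.example/post-2')  # served from the cache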