# Provenance: git patch fabe469a53e41d0490b4500c567f5e49af261e3e
# ("Add missing file", Thomas Sileo, Thu, 4 Aug 2022 19:13:32 +0200),
# which creates app/utils/privacy_replace.py.
"""Rewrite link targets according to the ``PRIVACY_REPLACE`` mapping.

``PRIVACY_REPLACE`` is imported from ``app.config`` and is used here as a
mapping from a URL network location (with any leading ``www.`` removed) to
the network location that should replace it. When it is falsy, both
functions return their input untouched.
"""
from urllib.parse import urlparse

from bs4 import BeautifulSoup  # type: ignore
from loguru import logger

from app.config import PRIVACY_REPLACE


def replace_content(content: str) -> str:
    """Return *content* with every ``<a href>`` passed through ``replace_url``.

    Args:
        content: HTML markup to scan for anchors.

    Returns:
        The original string, unmodified, when replacement is disabled or when
        no ``href`` was actually rewritten; otherwise the re-serialised markup
        with the rewritten links.
    """
    if not PRIVACY_REPLACE:
        return content

    soup = BeautifulSoup(content, "html5lib")

    rewritten = False
    for link in soup.find_all("a", href=True):
        old_href = link.attrs["href"]
        new_href = replace_url(old_href)
        if new_href != old_href:
            link.attrs["href"] = new_href
            rewritten = True

    # Round-tripping through the parser can alter the markup, so hand back
    # the caller's exact string unless a link really changed. This also covers
    # the "no anchors at all" case.
    if not rewritten:
        return content

    # The html5lib parser wraps the input in a full document; only the inner
    # markup of <body> corresponds to what was passed in.
    return soup.find("body").decode_contents()


def replace_url(u: str) -> str:
    """Return *u* with its network location swapped per ``PRIVACY_REPLACE``.

    The network location is looked up with a leading ``www.`` removed, first
    exactly as written and then lowercased, because host names are
    case-insensitive while ``urlparse`` preserves the original casing of
    ``netloc``.

    Args:
        u: The URL to rewrite.

    Returns:
        The rewritten URL when a replacement is configured for its network
        location; otherwise *u* unchanged (including when replacement is
        disabled or the URL cannot be parsed).
    """
    if not PRIVACY_REPLACE:
        return u

    try:
        parsed_href = urlparse(u)
    except Exception:
        # Best effort: an unparsable link is left alone rather than failing
        # the whole rewrite.
        logger.warning(f"Failed to parse url={u}")
        return u

    netloc = parsed_href.netloc

    # Exact lookup first so existing configuration entries keep matching
    # exactly as before.
    new_netloc = PRIVACY_REPLACE.get(netloc.removeprefix("www."))
    if not new_netloc:
        # Case-insensitive fallback, e.g. "WWW.Example.COM".
        # NOTE(review): only helps if the configured keys are lowercase —
        # confirm against app.config.
        new_netloc = PRIVACY_REPLACE.get(netloc.lower().removeprefix("www."))

    if new_netloc:
        return parsed_href._replace(netloc=new_netloc).geturl()

    return u