Merge pull request #319 from bellingcat/feat/linkedin-antibot

Antibot Dropin for Linkedin
Miguel Sozinho Ramalho 2025-06-11 19:42:38 +01:00 committed by GitHub
commit 3d31c7605b
No known key found for this signature in the database
GPG key ID: B5690EEEBB952194
24 changed files with 346 additions and 38 deletions

.gitignore vendored

@@ -1,6 +1,7 @@
tmp*/
temp/
.env*
!.env*.example
.DS_Store
expmt/
service_account.json
@@ -37,4 +38,5 @@ docs/source/modules/autogen/
scripts/settings_page.html
scripts/settings/src/schema.json
.vite
downloaded_files
latest_logs


@@ -47,7 +47,6 @@ def generate_module_docs():
for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file.
manifest = module.manifest
for type in manifest["type"]:
modules_by_type.setdefault(type, []).append(module)
@@ -64,6 +63,27 @@ def generate_module_docs():
"""
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"])
if manifest.get("autodoc_dropins"):
loaded_module = module.load({})
dropins = loaded_module.load_dropins()
dropin_str = "\n##### Available Dropins\n"
for dropin in dropins:
if not (ddoc := dropin.documentation()):
continue
dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
dropin_str += f"{ddoc.get('description')}\n\n"
if ddoc.get("site"):
dropin_str += f"**Site**: {ddoc['site']}\n\n"
if dauth := ddoc.get("authentication"):
dropin_str += "**YAML configuration**:\n"
dropin_auth_yaml = "authentication:\n...\n"
for site, creds in dauth.items():
dropin_auth_yaml += f" {site}:\n"
for k, v in creds.items():
dropin_auth_yaml += f' {k}: "{v}"\n'
dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
readme_str += dropin_str
if not manifest["configs"]:
config_string = f"# No configuration options for {module.name}.*\n"
else:
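To make the new autodoc path concrete: fed the VkDropin `documentation()` dict that appears later in this diff, the loop above renders an "##### Available Dropins" entry whose YAML block comes out roughly as follows (a hand-traced sketch, not captured output):

```{code} yaml
authentication:
...
  vk.com:
    username: "phone number with country code"
    password: "password"
...
```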


@@ -3,14 +3,14 @@
`pytest` is used for testing. There are two main types of tests:
1. 'core' tests which should be run on every change
2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
2. 'download' tests which hit the network. These tests will do things like make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed; they take longer to run.
## Running Tests
1. Make sure you've installed the dev dependencies with `poetry install --with dev`
2. Tests can be run as follows:
```
```{code} bash
#### Command prefix of 'poetry run' removed here for simplicity
# run core tests
pytest -ra -v -m "not download"
@@ -18,4 +18,15 @@ pytest -ra -v -m "not download"
pytest -ra -v -m "download"
# run all tests
pytest -ra -v
# run a specific test file
pytest -ra -v tests/test_file.py
# run a specific test function
pytest -ra -v tests/test_file.py::test_function_name
```
3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
```{code} bash
cp .env.test.example .env.test
```
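For the new antibot tests specifically, a filled-in `.env.test` might look like this (placeholder values; the keys come from the `.env.test.example` added further down in this diff):

```{code} bash
# ANTIBOT reddit test credentials
REDDIT_TEST_USERNAME="my_reddit_user"
REDDIT_TEST_PASSWORD="my_reddit_password"
# ANTIBOT linkedin test credentials
LINKEDIN_TEST_USERNAME="me@example.com"
LINKEDIN_TEST_PASSWORD="my_linkedin_password"
```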


@@ -15,19 +15,29 @@ We have dropped the `vk_extractor` because of problems in a project we relied on
Module 'vk_extractor' not found. Are you sure it's installed/exists?
```
## Dropping `screenshot_enricher` module
We have dropped the `screenshot_enricher` module because the new `antibot_extractor_enricher` module (see below) replaces its functionality more robustly and with less dependency hassle around geckodriver/Firefox. You will need to remove it from your configuration file; otherwise you will see an error like:
```{code} console
Module 'screenshot_enricher' not found. Are you sure it's installed/exists?
```
## New `antibot_extractor_enricher` module and VkDropin
We have added a new `antibot_extractor_enricher` module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
We have added a new [`antibot_extractor_enricher`](../modules/autogen/extractor/antibot_extractor_enricher.md) module that uses a computer-controlled browser to extract content from websites that use anti-bot measures. You can add it to your configuration file like this:
```{code} yaml
steps:
extractors:
- antibot_extractor_enricher
extractors:
- antibot_extractor_enricher
# or alternatively, if you want to use it as an enricher:
enrichers:
- antibot_extractor_enricher
# or alternatively, if you want to use it as an enricher:
enrichers:
- antibot_extractor_enricher
```
It will take a full-page screenshot, capture a PDF, extract the HTML source code, and download any other relevant media.
It comes with Dropins that we will be adding and maintaining.
> Dropin: A module with site-specific behaviours that is loaded automatically. You don't need to add Dropins to your configuration steps for them to run, though some need `authentication` settings.
@@ -36,9 +46,9 @@ One such Dropin is the VkDropin which uses this automated browser to access VKon
```{code} yaml
authentication:
vk:
username: your_username
password: your_password
vk.com:
username: your_username
password: your_password
```
See all available Dropins in [the source code](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules/antibot_extractor_enricher/dropins). Usually each Dropin needs its own authentication settings, as with the VkDropin above.
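For example, the LinkedinDropin added in this PR reads its credentials from a `linkedin.com` block, per its `documentation()`:

```{code} yaml
authentication:
  linkedin.com:
    username: email address or phone number
    password: password
```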


@@ -77,6 +77,8 @@ class Extractor(BaseModule):
downloads a URL to the provided filename (or one inferred from the URL) and returns the local filename
Warning: if try_best_quality is True, it will return a tuple of (filename, best_quality_url) if the download was successful.
"""
if any(url.startswith(x) for x in ["blob:", "data:"]):
# browser-only blob:/data: URLs cannot be fetched over HTTP: no local file, but keep the URL when the caller asked for the best-quality variant
return (None, url) if try_best_quality else None
if try_best_quality:
with suppress(Exception):
@@ -116,6 +118,8 @@ class Extractor(BaseModule):
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {str(e)[:250]}")
if try_best_quality:
return None, url
@abstractmethod
def download(self, item: Metadata) -> Metadata | False:
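A caller-side sketch of the `try_best_quality` contract above (`extractor` and `src` are hypothetical names, and the exact call signature is an assumption):

```{code} python
# with try_best_quality=True the method returns a (filename, best_quality_url) tuple;
# blob:/data: URLs and failed fetches yield filename=None, so callers must check it
filename, best_url = extractor.download_from_url(src, try_best_quality=True)  # hypothetical call site
if filename is None:
    logger.debug(f"no local copy of {src}; best-quality candidate was {best_url}")
```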


@@ -34,7 +34,7 @@ from .config import (
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES, SetupError
from auto_archiver.utils.url import check_url_or_raise
from auto_archiver.utils.url import check_url_or_raise, clean
if TYPE_CHECKING:
from .base_module import BaseModule
@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
url = clean(original_url)
for a in self.extractors:
url = a.sanitize_url(url)


@@ -31,11 +31,12 @@
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
},
},
"autodoc_dropins": True,
"description": """
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, bypassing anti-bot measures like Cloudflare's Turnstile or Google reCAPTCHA.
Still in trial development, please report any issues or suggestions via GitHub Issues.
> Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
### Features
- Extracts the HTML source code of the page.
- Takes full-page screenshots of web pages.
@@ -44,5 +45,9 @@
### Notes
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
### Dropins
This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom login flows. You don't need to include the Dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites; see detailed instructions for each Dropin below.
""",
}


@@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher):
self._enrich_download_media(
sb,
to_enrich,
css_selector=dropin.images_selectors(),
js_css_selector=dropin.js_for_image_css_selectors(),
max_media=self.max_download_images - downloaded_images,
)
self._enrich_download_media(
sb,
to_enrich,
css_selector=dropin.video_selectors(),
js_css_selector=dropin.js_for_video_css_selectors(),
max_media=self.max_download_videos - downloaded_videos,
)
logger.info(f"ANTIBOT completed for {url_sample}")
@@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
to_enrich.add_media(Media(filename=pdf_filename), id="pdf")
@logger.catch
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int):
def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int):
"""
Downloads media from the page and adds them to the Metadata object.
This method is called by the enrich method.
@@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
url = to_enrich.get_url()
all_urls = set()
sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href)
.filter(Boolean);
""")
sources = sb.execute_script(js_css_selector)
# the JS comes from the dropin's js_for_image_css_selectors / js_for_video_css_selectors hooks
for src in sources:
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")


@@ -1,4 +1,5 @@
import os
from typing import Mapping
from loguru import logger
from seleniumbase import SB
import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
This class is designed to be a base class for drop-ins that can handle specific websites.
"""
@staticmethod
def documentation() -> Mapping[str, str]:
"""
Each Dropin should auto-document itself with this method.
The returned dictionary can include:
- 'name': A string representing the name of the dropin.
- 'description': A string describing the functionality of the dropin.
- 'site': A string representing the site this dropin is for.
- 'authentication': A dictionary with authentication example for the site.
"""
return {}
def __init__(self, sb: SB, extractor: Extractor):
"""
Initialize the Dropin with the given SeleniumBase instance.
@@ -53,6 +67,26 @@
"""
return "video, source"
def js_for_image_css_selectors(self) -> str:
"""
A configurable JS script built around the dropin's own CSS selector; it returns an array of image URLs (src/href) matching the selection.
You can override this instead of `images_selectors` for more control over scraped images.
"""
return f"""
return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
"""
def js_for_video_css_selectors(self) -> str:
"""
A configurable JS script built around the dropin's own CSS selector; it returns an array of video URLs (src/href) matching the selection.
You can override this instead of `video_selectors` for more control over scraped videos.
"""
return f"""
return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
"""
def open_page(self, url) -> bool:
"""
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -66,7 +100,7 @@
Extract image and/or video data from the currently open post with SeleniumBase. Media is added to the `to_enrich` Metadata object.
:return: A tuple (number of Images added, number of Videos added).
"""
raise NotImplementedError("This method should be implemented in the subclass")
return 0, 0
def _get_username_password(self, site) -> tuple[str, str]:
"""


@@ -1,4 +1,3 @@
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -13,6 +12,3 @@ class DefaultDropin(Dropin):
def open_page(self, url) -> bool:
return True
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
return 0, 0


@@ -0,0 +1,74 @@
from typing import Mapping
from loguru import logger
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
class LinkedinDropin(Dropin):
"""
A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
"""
@staticmethod
def documentation() -> Mapping[str, str]:
return {
"name": "Linkedin Dropin",
"description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it. The first time you login to a new IP, LinkedIn may require an email verification code, you can do a manual login first and then it won't ask for it again.",
"site": "linkedin.com",
"authentication": {
"linkedin.com": {
"username": "email address or phone number",
"password": "password",
}
},
}
notifications_css_selector = 'a[href*="linkedin.com/notifications"]'
@staticmethod
def suitable(url: str) -> bool:
return "linkedin.com" in url
def js_for_image_css_selectors(self) -> str:
get_all_css = "main img:not([src*='profile-displayphoto']):not([src*='profile-framedphoto'])"
get_first_css = (
"main img[src*='profile-framedphoto'], main img[src*='profile-displayphoto'], main img[src*='company-logo']"
)
return f"""
const all = Array.from(document.querySelectorAll("{get_all_css}")).map(el => el.src || el.href).filter(Boolean);
const profile = document.querySelector("{get_first_css}");
return all.concat(profile?.src || profile?.href || []).filter(Boolean);
"""
@staticmethod
def video_selectors() -> str:
# usually videos are from blob: but running the generic extractor should handle that
return "main video"
def open_page(self, url) -> bool:
if not self.sb.is_element_present(self.notifications_css_selector):
self._login()
if url != self.sb.get_current_url():
self.sb.open(url)
return True
@logger.catch
def _login(self) -> bool:
if self.sb.is_text_visible("Sign in to view more content"):
self.sb.click_link_text("Sign in", timeout=2)
self.sb.wait_for_ready_state_complete()
else:
self.sb.open("https://www.linkedin.com/login")
self.sb.wait_for_ready_state_complete()
username, password = self._get_username_password("linkedin.com")
logger.debug("LinkedinDropin Logging in to Linkedin with username: {}", username)
self.sb.type("#username", username)
self.sb.type("#password", password)
self.sb.click_if_visible("#password-visibility-toggle", timeout=0.5)
self.sb.click("button[type='submit']")
self.sb.wait_for_ready_state_complete()
# TODO: on suspicious login, LinkedIn may require an email verification code
if not self.sb.is_element_present(self.notifications_css_selector):
self.sb.click_if_visible('button[aria-label="Dismiss"]', timeout=0.5)


@@ -1,4 +1,5 @@
from contextlib import suppress
from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -10,6 +11,19 @@ class RedditDropin(Dropin):
A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
"""
@staticmethod
def documentation() -> Mapping[str, str]:
return {
"name": "Reddit Dropin",
"description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
"site": "reddit.com",
"authentication": {
"reddit.com": {
"username": "email address or username",
"password": "password",
}
},
}
@staticmethod
def suitable(url: str) -> bool:
return "reddit.com" in url
@@ -36,7 +50,7 @@ class RedditDropin(Dropin):
self._close_cookies_banner()
username, password = self._get_username_password("reddit.com")
logger.debug("RedditDropin Logging in to VK with username: {}", username)
logger.debug("RedditDropin Logging in to Reddit with username: {}", username)
self.sb.type("#login-username", username)
self.sb.type("#login-password", password)


@@ -1,4 +1,5 @@
import re
from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -16,6 +17,19 @@ class VkDropin(Dropin):
CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
@staticmethod
def documentation() -> Mapping[str, str]:
return {
"name": "VKontakte Dropin",
"description": "Handles VKontakte posts and works without authentication for some content.",
"site": "vk.com",
"authentication": {
"vk.com": {
"username": "phone number with country code",
"password": "password",
}
},
}
@staticmethod
def suitable(url: str) -> bool:
return "vk.com" in url
@@ -39,7 +53,7 @@
@logger.catch
def _login(self) -> bool:
# TODO: test method
# TODO: test method, because current tests work without a login
self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url():


@@ -7,8 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
class Twitter(GenericDropin):


@@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse, urlunparse
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from ipaddress import ip_address
@@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str:
def clean(url: str) -> str:
return url
TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}
parsed = urlparse(url)
clean_qs = [(k, v) for k, v in parse_qsl(parsed.query) if k not in TRACKERS]
return parsed._replace(query=urlencode(clean_qs)).geturl()
def is_auth_wall(url: str) -> bool:
@@ -109,6 +113,8 @@ def is_relevant_url(url: str) -> bool:
# reddit
("styles.redditmedia.com",), # opinionated but excludes may irrelevant images like avatars and banners
("emoji.redditmedia.com",),
# linkedin
("static.licdn.com",),
]
# TODO: make these globally configurable
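The effect of the new `clean()` is easiest to see inline; these examples mirror the parametrized tests added at the bottom of this diff:

```{code} python
from auto_archiver.utils.url import clean

# tracker params are stripped; other query params and fragments survive
assert clean("https://example.com/page?utm_source=google&foo=bar") == "https://example.com/page?foo=bar"
assert clean("https://example.com/page?utm_source=google#section-1") == "https://example.com/page#section-1"
assert clean("https://example.com/page") == "https://example.com/page"
```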


@@ -0,0 +1,10 @@
# ANTIBOT reddit test credentials
REDDIT_TEST_USERNAME=""
REDDIT_TEST_PASSWORD=""
# ANTIBOT linkedin test credentials
LINKEDIN_TEST_USERNAME=""
LINKEDIN_TEST_PASSWORD=""
# twitter test credentials
TWITTER_BEARER_TOKEN="TEST_KEY"


@@ -9,6 +9,7 @@ from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
from loguru import logger
import pytest
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.module import ModuleFactory
@@ -20,6 +21,24 @@ from auto_archiver.core.module import ModuleFactory
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
def pytest_configure():
# load environment variables from .env.test file.
env_path = os.path.join(os.path.dirname(__file__), ".env.test")
if os.path.exists(env_path):
with open(env_path) as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" in line:
key, value = line.split("=", 1)
os.environ[key.strip()] = value.strip().lstrip('"').rstrip('"')
else:
logger.warning(
f"Environment file {env_path} not found. Skipping loading environment variables, some tests may fail."
)
# don't check for ytdlp updates in tests
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):


@@ -1,3 +1,4 @@
import os
import pytest
from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher import AntibotExtractorEnricher
@@ -34,7 +35,18 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
"save_to_pdf": False,
"max_download_images": 0,
"max_download_videos": 0,
"user_data_dir": "./tests/tmp/user_data",
"proxy": None,
"authentication": {
"reddit.com": {
"username": os.environ.get("REDDIT_TEST_USERNAME"),
"password": os.environ.get("REDDIT_TEST_PASSWORD"),
},
"linkedin.com": {
"username": os.environ.get("LINKEDIN_TEST_USERNAME"),
"password": os.environ.get("LINKEDIN_TEST_PASSWORD"),
},
},
}
@pytest.mark.download
@@ -82,10 +94,10 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
"""
Test downloading pages with media.
"""
self.extractor = setup_module(
self.extractor_module,
{
self.config
| {
"save_to_pdf": True,
"max_download_images": 5,
"max_download_videos": "inf",
@@ -118,6 +130,50 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
f"Expected media with id '{expected_id}' not found"
)
@pytest.mark.skipif(
not os.environ.get("REDDIT_TEST_USERNAME") or not os.environ.get("REDDIT_TEST_PASSWORD"),
reason="No Reddit test credentials provided",
)
@pytest.mark.download
@pytest.mark.parametrize(
"url,in_title,in_text,image_count,video_count",
[
(
"https://www.reddit.com/r/BeAmazed/comments/1l6b1n4/duy_tran_is_the_owner_and_prime_wood_work_artist/",
"Duy tran is the owner and prime wood work artist",
" Created Jan 26, 2015",
4,
0,
),
],
)
def test_reddit_download_with_login(
self, setup_module, make_item, url, in_title, in_text, image_count, video_count
):
self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
@pytest.mark.skipif(
not os.environ.get("LINKEDIN_TEST_USERNAME") or not os.environ.get("LINKEDIN_TEST_PASSWORD"),
reason="No LinkedIn test credentials provided",
)
@pytest.mark.download
@pytest.mark.parametrize(
"url,in_title,in_text,image_count,video_count",
[
(
"https://www.linkedin.com/posts/bellingcat_live-podcast-bellingcat-activity-7331725631799398400-xocM/",
"Post",
"It takes time to go from hunch to reporting...",
2,
0,
),
],
)
def test_linkedin_download_with_login(
self, setup_module, make_item, url, in_title, in_text, image_count, video_count
):
self.test_download_pages_with_media(setup_module, make_item, url, in_title, in_text, image_count, video_count)
@pytest.mark.download
@pytest.mark.parametrize(
"url,in_html",


@@ -1,5 +1,6 @@
import pytest
from auto_archiver.utils.url import (
clean,
is_auth_wall,
check_url_or_raise,
domain_for_url,
@@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality):
)
def test_get_media_url_best_quality(input_url, expected_url):
assert get_media_url_best_quality(input_url) == expected_url
@pytest.mark.parametrize(
"input_url,expected_url",
[
# No trackers present
("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"),
# Single tracker present
("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"),
# Multiple trackers present
("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"),
# Trackers mixed with other params
(
"https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123",
"https://example.com/page?foo=bar&baz=qux",
),
# Only trackers present
("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"),
# No query string
("https://example.com/page", "https://example.com/page"),
# Trackers in fragment (should not be removed)
("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"),
# Tracker removed while the fragment is preserved
("https://example.com/page?utm_source=google#section-1", "https://example.com/page#section-1"),
# Trackers with empty value
("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"),
# Trackers with multiple values
("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"),
# Trackers with encoded values
("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"),
# Unrelated param with similar name
("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"),
],
)
def test_clean_removes_trackers(input_url, expected_url):
assert clean(input_url) == expected_url