Replace urllib with requests in contentproviders

requests is generally simpler to use, and more people
are familiar with it than with urllib.
pull/993/head
David Douard 2020-12-10 18:55:14 +01:00
parent 560b1d96a0
commit 830a9c89c0
12 changed files with 427 additions and 397 deletions
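The core of the change is mechanical; a minimal before/after sketch of the pattern being replaced throughout the content providers (illustrative URL, not taken from the diff):

import json
from urllib.request import Request, urlopen

import requests

url = "https://zenodo.org/api/records/1234"  # illustrative

# Before: urllib needs an explicit Request object for custom headers,
# and the body must be read and decoded by hand.
req = Request(url, headers={"accept": "application/json"})
record = json.loads(urlopen(req).read().decode("utf-8"))

# After: requests takes headers as a keyword argument and parses the
# JSON body itself.
resp = requests.get(url, headers={"accept": "application/json"})
record = resp.json()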

View file

@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock

View file

@@ -2,7 +2,6 @@ import os
 import json
 import shutil
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs

 from .doi import DoiProvider
@@ -56,7 +55,6 @@ class Dataverse(DoiProvider):
             return
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ class Dataverse(DoiProvider):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ class Dataverse(DoiProvider):
         host = spec["host"]

         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
             host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()

         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(

View file

@@ -5,8 +5,8 @@ import logging
 from os import makedirs
 from os import path
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
 from zipfile import ZipFile, is_zipfile

 from .base import ContentProvider
@@ -18,7 +18,21 @@ from .. import __version__
 class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""

-    def urlopen(self, req, headers=None):
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
+
+    def _urlopen(self, req, headers=None):
         """A urlopen() helper"""
         # someone passed a string, not a request
         if not isinstance(req, request.Request):
@@ -38,7 +52,8 @@ class DoiProvider(ContentProvider):
         doi = normalize_doi(doi)

         try:
-            resp = self.urlopen("https://doi.org/{}".format(doi))
+            resp = self._request("https://doi.org/{}".format(doi))
+            resp.raise_for_status()
         # If the DOI doesn't resolve, just return URL
         except HTTPError:
             return doi
@@ -53,7 +68,11 @@ class DoiProvider(ContentProvider):
             file_url = deep_get(file_ref, host["download"])
             fname = deep_get(file_ref, host["filename"])
             logging.debug("Downloading file {} as {}\n".format(file_url, fname))
-            with self.urlopen(file_url) as src:
+
+            yield "Requesting {}\n".format(file_url)
+            resp = self._request(file_url, stream=True)
+            resp.raise_for_status()
             if path.dirname(fname):
                 sub_dir = path.join(output_dir, path.dirname(fname))
                 if not path.exists(sub_dir):
@@ -63,9 +82,9 @@
             dst_fname = path.join(output_dir, fname)
             with open(dst_fname, "wb") as dst:
                 yield "Fetching {}\n".format(fname)
-                shutil.copyfileobj(src, dst)
-            # first close the newly written file, then continue
-            # processing it
+                for chunk in resp.iter_content(chunk_size=None):
+                    dst.write(chunk)
+
             if unzip and is_zipfile(dst_fname):
                 yield "Extracting {}\n".format(fname)
                 zfile = ZipFile(dst_fname)
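The download path is the one behavioural change worth calling out: urllib returned a file-like response that could be handed to shutil.copyfileobj, while requests only streams when asked to. A condensed sketch of the new flow, using the same names as the diff above (session corresponds to the Session created in __init__):

from requests import Session

session = Session()  # one Session per provider, reused across requests
resp = session.get(file_url, stream=True)  # stream=True defers the body download
resp.raise_for_status()  # surface HTTP 4xx/5xx as requests.HTTPError

with open(dst_fname, "wb") as dst:
    # chunk_size=None yields data as it arrives, so large archives are
    # written to disk without being buffered fully in memory
    for chunk in resp.iter_content(chunk_size=None):
        dst.write(chunk)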

View file

@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
     """

     def __init__(self):
+        super().__init__()
         self.hosts = [
             {
                 "hostname": [
@@ -74,13 +75,12 @@

         yield "Fetching Figshare article {} in version {}.\n".format(
             article_id, article_version
         )
-        req = Request(
+        resp = self.urlopen(
             "{}{}/versions/{}".format(host["api"], article_id, article_version),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        article = json.loads(resp.read().decode("utf-8"))
+        article = resp.json()

         files = deep_get(article, host["filepath"])
         # only fetch files where is_link_only: False

View file

@@ -16,9 +16,7 @@
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]

View file

@@ -15,6 +15,7 @@
     """Provide contents of a Zenodo deposit."""

     def __init__(self):
+        super().__init__()
         # We need the hostname (url where records are), api url (for metadata),
         # filepath (path to files in metadata), filename (path to filename in
         # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@
         host = spec["host"]

         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()

         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])

View file

@@ -52,6 +52,7 @@ setup(
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",

View file

@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re

 from io import BytesIO
 from tempfile import TemporaryDirectory
@@ -19,7 +20,7 @@ test_hosts = [
         "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
         "10.7910/DVN/6ZXAGT",
         "https://dataverse.harvard.edu/api/access/datafile/3323458",
-        "hdl:11529/10016",
+        "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
     ],
     [
         {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
@@ -27,56 +28,67 @@ test_hosts = [
         ],
     )
 ]

-test_responses = {
-    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
+doi_responses = {
+    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
         "https://dataverse.harvard.edu/file.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
     ),
-    "doi:10.7910/DVN/6ZXAGT": (
+    "https://doi.org/10.7910/DVN/6ZXAGT": (
         "https://dataverse.harvard.edu/dataset.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT"
     ),
-    "10.7910/DVN/6ZXAGT": (
-        "https://dataverse.harvard.edu/dataset.xhtml"
-        "?persistentId=doi:10.7910/DVN/6ZXAGT"
+    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
+        "https://dataverse.harvard.edu/api/access/datafile/3323458"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
     ),
-    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
-    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
-}
-
-test_search = {
-    "data": {
-        "count_in_response": 1,
-        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
-    }
 }


 @pytest.mark.parametrize("test_input, expected", test_hosts)
-def test_detect_dataverse(test_input, expected):
-    def doi_resolver(url):
-        return test_responses.get(url)
-
-    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
-        Dataverse, "doi2url", side_effect=doi_resolver
-    ) as fake_doi2url:
-        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
-        # valid Dataverse DOIs trigger this content provider
-        assert Dataverse().detect(test_input[0]) == expected[0]
-        assert fake_doi2url.call_count == 2  # File, then dataset
-        assert Dataverse().detect(test_input[1]) == expected[0]
-        assert Dataverse().detect(test_input[2]) == expected[0]
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 1
-        assert Dataverse().detect(test_input[3]) == expected[1]
-
-    with patch.object(Dataverse, "urlopen") as fake_urlopen:
-        # Don't trigger the Dataverse content provider
-        assert Dataverse().detect("/some/path/here") is None
-        assert Dataverse().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Dataverse
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
+def test_detect_dataverse(test_input, expected, requests_mock):
+    def doi_resolver(req, context):
+        resp = doi_responses.get(req.url)
+        # doi responses are redirects
+        if resp is not None:
+            context.status_code = 302
+            context.headers["Location"] = resp
+        return resp
+
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+    requests_mock.get(
+        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
+        json={
+            "count_in_response": 1,
+            "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
+        },
+    )
+
+    assert requests_mock.call_count == 0
+    # valid Dataverse DOIs trigger this content provider
+    assert Dataverse().detect(test_input[0]) == expected[0]
+    # 4: doi resolution (302), File, doi resolution (302), then dataset
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[1]) == expected[0]
+    # 2: doi (302), dataset
+    assert requests_mock.call_count == 2
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[2]) == expected[0]
+    # 1: datafile (search dataverse for the file id)
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[3]) == expected[1]
+    requests_mock.reset_mock()
+
+    # Don't trigger the Dataverse content provider
+    assert Dataverse().detect("/some/path/here") is None
+    assert Dataverse().detect("https://example.com/path/here") is None
+    # don't handle DOIs that aren't from Dataverse
+    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
@@ -95,11 +107,8 @@ def dv_files(tmpdir):
     return [f1, f2, f3]


-def test_dataverse_fetch(dv_files):
-    mock_response_ds_query = BytesIO(
-        json.dumps(
-            {
-                "data": {
-                    "latestVersion": {
-                        "files": [
-                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+def test_dataverse_fetch(dv_files, requests_mock):
+    mock_response = {
+        "latestVersion": {
+            "files": [
+                {"dataFile": {"id": 1}, "label": "some-file.txt"},
@@ -116,26 +125,29 @@ def test_dataverse_fetch(dv_files, requests_mock):
             ]
         }
     }
-            }
-        ).encode("utf-8")
-    )

     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

+    def mock_filecontent(req, context):
+        file_no = int(req.url.split("/")[-1]) - 1
+        return open(dv_files[file_no], "rb").read()
+
+    requests_mock.get(
+        "https://dataverse.harvard.edu/api/datasets/"
+        ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
+        json=mock_response,
+    )
+    requests_mock.get(
+        re.compile("https://dataverse.harvard.edu/api/access/datafile"),
+        content=mock_filecontent,
+    )
+
     dv = Dataverse()

-    def mock_urlopen(self, req):
-        if isinstance(req, Request):
-            return mock_response_ds_query
-        else:
-            file_no = int(req.split("/")[-1]) - 1
-            return urlopen("file://{}".format(dv_files[file_no]))
-
-    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-        with TemporaryDirectory() as d:
-            output = []
-            for l in dv.fetch(spec, d):
-                output.append(l)
-
-            unpacked_files = set(os.listdir(d))
-            expected = set(["directory", "some-file.txt"])
-            assert expected == unpacked_files
+    with TemporaryDirectory() as d:
+        output = []
+        for l in dv.fetch(spec, d):
+            output.append(l)
+
+        unpacked_files = set(os.listdir(d))
+        expected = set(["directory", "some-file.txt"])
+        assert expected == unpacked_files
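The rewritten tests lean on the requests_mock pytest fixture (added to the dev requirements above) instead of patching urlopen: the fixture intercepts every request made through requests, including the 302 redirects doi.org answers with. A minimal, self-contained illustration of the pattern, with hypothetical URLs:

import re

import requests


def test_redirect_sketch(requests_mock):
    # a fixed JSON body for one exact URL
    requests_mock.get("https://zenodo.org/api/records/1234", json={"files": []})

    # a callback for a URL pattern; setting status_code and Location on
    # the context is how the tests above fake doi.org redirects
    def redirect(req, context):
        context.status_code = 302
        context.headers["Location"] = "https://zenodo.org/api/records/1234"
        return ""

    requests_mock.get(re.compile("https://doi.org/"), text=redirect)

    resp = requests.get("https://doi.org/10.5281/zenodo.1234")  # follows the 302
    assert resp.json() == {"files": []}
    assert requests_mock.call_count == 2  # redirect, then the final document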

View file

@@ -11,6 +11,7 @@ from zipfile import ZipFile

 from repo2docker.contentproviders.doi import DoiProvider
 from repo2docker.contentproviders.base import ContentProviderException
+from repo2docker import __version__


 def test_content_id():
@@ -18,20 +19,15 @@ def test_content_id():
     assert doi.content_id is None


-def fake_urlopen(req):
-    print(req)
-    return req.headers
-
-
-@patch("urllib.request.urlopen", fake_urlopen)
-def test_url_headers():
+def test_url_headers(requests_mock):
+    requests_mock.get("https://mybinder.org", text="resp")
     doi = DoiProvider()

     headers = {"test1": "value1", "Test2": "value2"}
     result = doi.urlopen("https://mybinder.org", headers=headers)
-    assert "Test1" in result
-    assert "Test2" in result
-    assert len(result) is 3  # User-agent is also set
+    assert "test1" in result.request.headers
+    assert "Test2" in result.request.headers
+    assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__)


 def test_unresolving_doi():

View file

@@ -22,9 +22,14 @@ test_content_ids = [

 @pytest.mark.parametrize("link,expected", test_content_ids)
-def test_content_id(link, expected):
-    with patch.object(Figshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = link
+def test_content_id(link, expected, requests_mock):
+    def mocked_get(req, context):
+        if req.url.startswith("https://doi.org"):
+            context.status_code = 302
+            context.headers["Location"] = link
+        return link
+
+    requests_mock.get(re.compile("https://"), text=mocked_get)

     fig = Figshare()
     fig.detect("10.6084/m9.figshare.9782777")
     assert fig.content_id == expected
@@ -103,12 +108,10 @@ def figshare_archive(prefix="a_directory"):
         yield zfile.name


-def test_fetch_zip():
+def test_fetch_zip(requests_mock):
     # see test_zenodo.py/test_fetch_software
     with figshare_archive() as fig_path:
-        mock_response = BytesIO(
-            json.dumps(
-                {
-                    "files": [
-                        {
-                            "name": "afake.zip",
+        mock_response = {
+            "files": [
+                {
+                    "name": "afake.zip",
@@ -117,16 +120,15 @@ def test_fetch_zip():
                 }
            ]
        }
-            ).encode("utf-8")
-        )
+
+        requests_mock.get(
+            "https://api.figshare.com/v2/articles/123456/versions/42",
+            json=mock_response,
+        )
+        requests_mock.get(
+            "file://{}".format(fig_path), content=open(fig_path, "rb").read()
+        )

-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Figshare, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in test_fig.fetch(test_spec, d):
+        # with patch.object(Figshare, "urlopen", new=mock_urlopen):
+        with TemporaryDirectory() as d:
+            output = []
+            for l in test_fig.fetch(test_spec, d):
@@ -137,12 +139,10 @@ def test_fetch_zip():
     assert expected == unpacked_files


-def test_fetch_data():
+def test_fetch_data(requests_mock):
     with figshare_archive() as a_path:
         with figshare_archive() as b_path:
-            mock_response = BytesIO(
-                json.dumps(
-                    {
-                        "files": [
-                            {
-                                "name": "afake.file",
+            mock_response = {
+                "files": [
+                    {
+                        "name": "afake.file",
@@ -157,16 +157,18 @@ def test_fetch_data():
                     {"name": "cfake.link", "is_link_only": True},
                ]
            }
-                ).encode("utf-8")
-            )
+
+            requests_mock.get(
+                "https://api.figshare.com/v2/articles/123456/versions/42",
+                json=mock_response,
+            )
+            requests_mock.get(
+                "file://{}".format(a_path), content=open(a_path, "rb").read()
+            )
+            requests_mock.get(
+                "file://{}".format(b_path), content=open(b_path, "rb").read()
+            )

-            def mock_urlopen(self, req):
-                if isinstance(req, Request):
-                    return mock_response
-                else:
-                    return urlopen(req)
-
-            with patch.object(Figshare, "urlopen", new=mock_urlopen):
-                with TemporaryDirectory() as d:
-                    output = []
-                    for l in test_fig.fetch(test_spec, d):
+            with TemporaryDirectory() as d:
+                output = []
+                for l in test_fig.fetch(test_spec, d):

View file

@@ -5,37 +5,51 @@ from contextlib import contextmanager
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest.mock import patch
 from zipfile import ZipFile
+import re

 from repo2docker.contentproviders import Hydroshare
 from repo2docker.contentproviders.base import ContentProviderException


-def test_content_id():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
+doi_responses = {
+    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
+        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
+
+
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
+
+
+hydroshare_data = {
+    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
+}
+
+
+def test_content_id(requests_mock):
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)

     hydro = Hydroshare()
     hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
     assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"


-def test_detect_hydroshare():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
+def test_detect_hydroshare(requests_mock):
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)

     # valid Hydroshare DOIs trigger this content provider
     expected = {
         "host": {
@@ -49,6 +63,7 @@ def test_detect_hydroshare():
         "resource": "b8f6eae9d89241cf8b5904033460af61",
         "version": "1569427757",
     }
+
     assert (
         Hydroshare().detect(
             "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
@@ -56,35 +71,31 @@ def test_detect_hydroshare():
         == expected
     )
     # assert a call to urlopen was called to fetch version
-    assert fake_urlopen.call_count == 1
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
     assert (
-        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-        == expected
+        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-    assert fake_urlopen.call_count == 3
+    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()
+
     assert (
         Hydroshare().detect(
             "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
         )
         == expected
     )
-    # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-    assert fake_urlopen.call_count == 5
+    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()

-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        # Don't trigger the Hydroshare content provider
-        assert Hydroshare().detect("/some/path/here") is None
-        assert Hydroshare().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Hydroshare
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-
-        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
+    # Don't trigger the Hydroshare content provider
+    assert Hydroshare().detect("/some/path/here") is None
+    assert Hydroshare().detect("https://example.com/path/here") is None
+    # don't handle DOIs that aren't from Hydroshare
+    assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None

View file

@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re

 from contextlib import contextmanager
 from io import BytesIO
@@ -11,12 +12,28 @@ from zipfile import ZipFile

 from repo2docker.contentproviders import Zenodo


-def test_content_id():
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
+doi_responses = {
+    "https://doi.org/10.5281/zenodo.3232985": ("https://zenodo.org/record/3232985"),
+    "https://doi.org/10.22002/d1.1235": ("https://data.caltech.edu/records/1235"),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
+
+
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
+
+
+def test_content_id(requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -43,24 +60,21 @@ test_hosts = [

 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected):
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = test_input[0]
-        # valid Zenodo DOIs trigger this content provider
-        assert Zenodo().detect(test_input[0]) == expected
-        assert Zenodo().detect(test_input[1]) == expected
-        assert Zenodo().detect(test_input[2]) == expected
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 2
+def test_detect_zenodo(test_input, expected, requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+    # valid Zenodo DOIs trigger this content provider
+    assert Zenodo().detect(test_input[0]) == expected
+    assert Zenodo().detect(test_input[1]) == expected
+    assert Zenodo().detect(test_input[2]) == expected
+    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()

-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        # Don't trigger the Zenodo content provider
-        assert Zenodo().detect("/some/path/here") is None
-        assert Zenodo().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Zenodo
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
+    # Don't trigger the Zenodo content provider
+    assert Zenodo().detect("/some/path/here") is None
+    assert Zenodo().detect("https://example.com/path/here") is None
+    # don't handle DOIs that aren't from Zenodo
+    assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
@@ -74,13 +88,11 @@ def zenodo_archive(prefix="a_directory"):
         yield zfile.name


-def test_fetch_software_from_github_archive():
+def test_fetch_software_from_github_archive(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo record created from a
     # GitHub repository via the Zenodo-GitHub integration
     with zenodo_archive() as zen_path:
-        mock_response = BytesIO(
-            json.dumps(
-                {
-                    "files": [
-                        {
-                            "filename": "some_dir/afake.zip",
+        mock_response = {
+            "files": [
+                {
+                    "filename": "some_dir/afake.zip",
@@ -89,16 +101,11 @@ def test_fetch_software_from_github_archive():
            ],
            "metadata": {"upload_type": "software"},
        }
-            ).encode("utf-8")
-        )
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Zenodo, "urlopen", new=mock_urlopen):
+
+        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+        requests_mock.get(
+            "file://{}".format(zen_path), content=open(zen_path, "rb").read()
+        )

         zen = Zenodo()
         spec = {"host": test_zen.hosts[0], "record": "1234"}
@@ -112,13 +119,11 @@ def test_fetch_software_from_github_archive():
     assert expected == unpacked_files


-def test_fetch_software():
+def test_fetch_software(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo software record with a
     # ZIP file in it
     with zenodo_archive() as zen_path:
-        mock_response = BytesIO(
-            json.dumps(
-                {
-                    "files": [
-                        {
-                            # this is the difference to the GitHub generated one,
+        mock_response = {
+            "files": [
+                {
+                    # this is the difference to the GitHub generated one,
@@ -129,16 +134,11 @@ def test_fetch_software():
            ],
            "metadata": {"upload_type": "software"},
        }
-            ).encode("utf-8")
-        )
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Zenodo, "urlopen", new=mock_urlopen):
+
+        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+        requests_mock.get(
+            "file://{}".format(zen_path), content=open(zen_path, "rb").read()
+        )

         with TemporaryDirectory() as d:
             zen = Zenodo()
             spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
@@ -151,13 +151,11 @@ def test_fetch_software():
     assert expected == unpacked_files


-def test_fetch_data():
+def test_fetch_data(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo data record
     with zenodo_archive() as a_zen_path:
         with zenodo_archive() as b_zen_path:
-            mock_response = BytesIO(
-                json.dumps(
-                    {
-                        "files": [
-                            {
-                                "filename": "afake.zip",
+            mock_response = {
+                "files": [
+                    {
+                        "filename": "afake.zip",
@@ -170,16 +168,14 @@ def test_fetch_data():
                ],
                "metadata": {"upload_type": "data"},
            }
-                ).encode("utf-8")
-            )
-
-            def mock_urlopen(self, req):
-                if isinstance(req, Request):
-                    return mock_response
-                else:
-                    return urlopen(req)
-
-            with patch.object(Zenodo, "urlopen", new=mock_urlopen):
+
+            requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+            requests_mock.get(
+                "file://{}".format(a_zen_path), content=open(a_zen_path, "rb").read()
+            )
+            requests_mock.get(
+                "file://{}".format(b_zen_path), content=open(b_zen_path, "rb").read()
+            )

             with TemporaryDirectory() as d:
                 zen = Zenodo()
                 spec = {"host": test_zen.hosts[0], "record": "1234"}