Add a CKAN content provider.

Review notes (changes relative to the patch as first submitted):
  * CKAN.detect() returns None, without making any request, for URLs whose
    path does not end in "/dataset/<id>".
  * CKAN.detect() drops params, query and fragment of the source URL when
    building the API URL.
  * CKAN.url_regex no longer admits a backslash or an empty dataset id.
  * CKAN._fetch_version() accepts timestamps without fractional seconds and
    raises on HTTP errors.
  * CKAN.fetch() raises on a failed package_show request and names files
    after the URL path only.
  * tests: flush the temporary file, close file handles, check the fetched
    content, add a detection test that runs against a mocked API.
  * The blob ids on the "index" lines of ckan.py and test_ckan.py predate
    these changes.

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 8795a222..e89eb8ff 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -39,7 +39,8 @@ where ```` is:
 
   * a URL of a Git repository (``https://github.com/binder-examples/requirements``),
   * a Zenodo DOI (``10.5281/zenodo.1211089``),
-  * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
+  * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
+  * a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
   * a path to a local directory (``a/local/directory``)
 
 of the source repository you want to build.
@@ -136,3 +137,4 @@ Command line API
 
 .. _Pytudes: https://github.com/norvig/pytudes
 .. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
+.. _CKAN: https://ckan.org
diff --git a/repo2docker/app.py b/repo2docker/app.py
index c7f3ab81..41e831e2 100755
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@@ -152,6 +152,7 @@ class Repo2Docker(Application):
             contentproviders.Dataverse,
             contentproviders.Hydroshare,
             contentproviders.Swhid,
+            contentproviders.CKAN,
             contentproviders.Mercurial,
             contentproviders.Git,
         ],
diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py
index 5c40476d..a0a2e019 100755
--- a/repo2docker/contentproviders/__init__.py
+++ b/repo2docker/contentproviders/__init__.py
@@ -1,4 +1,5 @@
 from .base import Local
+from .ckan import CKAN
 from .dataverse import Dataverse
 from .figshare import Figshare
 from .git import Git
diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py
new file mode 100644
index 00000000..2a1de448
--- /dev/null
+++ b/repo2docker/contentproviders/ckan.py
@@ -0,0 +1,123 @@
+import re
+from datetime import datetime, timedelta, timezone
+from os import path
+from urllib.parse import urlparse
+
+from requests import Session
+
+from .. import __version__
+from .base import ContentProvider
+
+
+class CKAN(ContentProvider):
+    """Provide contents of a remote CKAN dataset."""
+
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": f"repo2docker {__version__}",
+            }
+        )
+
+    def _fetch_version(self, api_url):
+        """Fetch dataset modified date and convert to epoch.
+        Borrowed from the Hydroshare provider.
+        """
+        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
+        resp = self.urlopen(package_show_url)
+        resp.raise_for_status()
+        date = resp.json()["result"]["metadata_modified"]
+        # The fractional-seconds part may be missing, so accept both forms
+        try:
+            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
+        except ValueError:
+            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
+        epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()
+        # truncate the timestamp
+        return str(int(epoch))
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
+    # Trailing "/dataset/<id>" part of a CKAN dataset URL path
+    url_regex = r"/dataset/[a-z0-9_-]+$"
+
+    def detect(self, source, ref=None, extra_args=None):
+        """Trigger this provider for things that resolve to a CKAN dataset.
+
+        Return a spec with the dataset id, API URL and version, or None if
+        ``source`` is not a URL of a dataset on a reachable CKAN instance.
+        """
+        parsed_url = urlparse(source)
+        if not parsed_url.netloc:
+            return None
+
+        # Without a match the substitution below would leave the path as is
+        # and we would probe "<source>status_show" on an unrelated site.
+        if not re.search(self.url_regex, parsed_url.path):
+            return None
+
+        # Drop params, query and fragment so they don't end up inside API URLs
+        api_url = parsed_url._replace(
+            path=re.sub(self.url_regex, "/api/3/action/", parsed_url.path),
+            params="",
+            query="",
+            fragment="",
+        ).geturl()
+
+        status_show_url = f"{api_url}status_show"
+        resp = self.urlopen(status_show_url)
+        if resp.status_code != 200:
+            return None
+
+        self.dataset_id = parsed_url.path.rsplit("/", maxsplit=1)[1]
+        self.version = self._fetch_version(api_url)
+        return {
+            "dataset_id": self.dataset_id,
+            "api_url": api_url,
+            "version": self.version,
+        }
+
+    def fetch(self, spec, output_dir, yield_output=False):
+        """Fetch a CKAN dataset."""
+        dataset_id = spec["dataset_id"]
+
+        yield f"Fetching CKAN dataset {dataset_id}.\n"
+        package_show_url = f"{spec['api_url']}package_show?id={dataset_id}"
+        resp = self.urlopen(
+            package_show_url,
+            headers={"accept": "application/json"},
+        )
+        resp.raise_for_status()
+
+        dataset = resp.json()
+
+        yield "Fetching CKAN resources.\n"
+
+        resources = dataset["result"]["resources"]
+
+        for resource in resources:
+            file_url = resource["url"]
+            # Name the file after the last path segment, ignoring any query
+            # string or fragment; fall back to the resource id.
+            fname = path.basename(urlparse(file_url).path)
+            if fname == "":
+                fname = resource["id"]
+
+            yield f"Requesting {file_url}\n"
+            resp = self._request(file_url, stream=True)
+            resp.raise_for_status()
+
+            dst_fname = path.join(output_dir, fname)
+            with open(dst_fname, "wb") as dst:
+                yield f"Fetching {fname}\n"
+                for chunk in resp.iter_content(chunk_size=None):
+                    dst.write(chunk)
+
+    @property
+    def content_id(self):
+        """A unique ID to represent the version of the content."""
+        return f"{self.dataset_id}.v{self.version}"
diff --git a/tests/unit/contentproviders/test_ckan.py b/tests/unit/contentproviders/test_ckan.py
new file mode 100644
index 00000000..ec65162b
--- /dev/null
+++ b/tests/unit/contentproviders/test_ckan.py
@@ -0,0 +1,83 @@
+import os
+from contextlib import contextmanager
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import CKAN
+
+test_ckan = CKAN()
+test_hosts = [
+    (
+        [
+            "http://demo.ckan.org/dataset/sample-dataset-1",
+        ],
+        {
+            "dataset_id": "sample-dataset-1",
+            "api_url": "http://demo.ckan.org/api/3/action/",
+            "version": "1707387710",
+        },
+    )
+]
+
+
+@pytest.mark.parametrize("test_input, expected", test_hosts)
+def test_detect_ckan(test_input, expected):
+    assert CKAN().detect(test_input[0]) == expected
+
+    # Don't trigger the CKAN content provider
+    assert CKAN().detect("/some/path/here") is None
+    assert CKAN().detect("https://example.com/path/here") is None
+    assert CKAN().detect("https://data.gov.tw/dataset/6564") is None
+
+
+def test_detect_ckan_offline(requests_mock):
+    """Detection against a mocked CKAN API, without network access."""
+    requests_mock.get("http://ckan.example.org/api/3/action/status_show", json={})
+    requests_mock.get(
+        "http://ckan.example.org/api/3/action/package_show?id=some-dataset",
+        # no fractional seconds in the timestamp
+        json={"result": {"metadata_modified": "2024-02-08T10:21:50"}},
+    )
+    # the query string must not leak into the API URL
+    assert CKAN().detect("http://ckan.example.org/dataset/some-dataset?q=1") == {
+        "dataset_id": "some-dataset",
+        "api_url": "http://ckan.example.org/api/3/action/",
+        "version": "1707387710",
+    }
+
+    # Not a dataset URL: must be rejected without any request being made
+    # (requests_mock raises for requests that are not registered above)
+    assert CKAN().detect("http://ckan.example.org/some/path") is None
+
+
+@contextmanager
+def ckan_file():
+    with NamedTemporaryFile() as file:
+        file.write(b"some content")
+        # flush so the content is on disk when the file is read by name
+        file.flush()
+        yield file.name
+
+
+def test_ckan_fetch(requests_mock):
+    with ckan_file() as ckan_path:
+        mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}}
+        requests_mock.get(
+            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
+        )
+        with open(ckan_path, "rb") as f:
+            requests_mock.get(f"file://{ckan_path}", content=f.read())
+        with TemporaryDirectory() as d:
+            ckan = CKAN()
+            spec = {
+                "dataset_id": "1234",
+                "api_url": "http://demo.ckan.org/api/3/action/",
+            }
+            # fetch() is a generator, it only does the work when consumed
+            for _ in ckan.fetch(spec, d):
+                pass
+            fname = os.path.basename(ckan_path)
+            assert {fname} == set(os.listdir(d))
+            with open(os.path.join(d, fname), "rb") as f:
+                assert f.read() == b"some content"