Add CKAN content provider

pull/1336/head
Sol Lee 2024-02-20 06:37:21 +00:00
rodzic a20dd1cf97
commit 391b9bc5ba
5 zmienionych plików z 164 dodań i 1 usunięć

Wyświetl plik

@ -39,7 +39,8 @@ where ``<source-repository>`` is:
* a URL of a Git repository (``https://github.com/binder-examples/requirements``), * a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``), * a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
* a path to a local directory (``a/local/directory``) * a path to a local directory (``a/local/directory``)
of the source repository you want to build. of the source repository you want to build.
@ -136,3 +137,4 @@ Command line API
.. _Pytudes: https://github.com/norvig/pytudes .. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html .. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
.. _CKAN: https://ckan.org

Wyświetl plik

@ -152,6 +152,7 @@ class Repo2Docker(Application):
contentproviders.Dataverse, contentproviders.Dataverse,
contentproviders.Hydroshare, contentproviders.Hydroshare,
contentproviders.Swhid, contentproviders.Swhid,
contentproviders.CKAN,
contentproviders.Mercurial, contentproviders.Mercurial,
contentproviders.Git, contentproviders.Git,
], ],

Wyświetl plik

@ -1,4 +1,5 @@
from .base import Local from .base import Local
from .ckan import CKAN
from .dataverse import Dataverse from .dataverse import Dataverse
from .figshare import Figshare from .figshare import Figshare
from .git import Git from .git import Git

Wyświetl plik

@ -0,0 +1,101 @@
import re
from datetime import datetime, timedelta, timezone
from os import path
from urllib.parse import urlparse
from requests import Session
from .. import __version__
from .base import ContentProvider
class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset."""

    # Matches the trailing ``/dataset/<id>`` part of a dataset page path.
    # The id must be non-empty; the hyphen is last in the character class so
    # it is taken literally without needing an escape.
    url_regex = r"/dataset/[a-z0-9_-]+$"

    def __init__(self):
        super().__init__()
        # One shared session so every request carries the same user agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        Reads ``self.dataset_id``, requests ``package_show`` from ``api_url``
        and returns the ``metadata_modified`` timestamp as a string holding
        whole seconds since the epoch. Raises the HTTP error reported by the
        response when the request did not succeed.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url)
        # Fail on an HTTP error rather than on a confusing missing-key error.
        resp.raise_for_status()
        date = resp.json()["result"]["metadata_modified"]
        # ``fromisoformat`` accepts timestamps with or without fractional
        # seconds; before Python 3.11 it rejects a trailing "Z" designator.
        if date.endswith("Z"):
            date = f"{date[:-1]}+00:00"
        parsed_date = datetime.fromisoformat(date)
        if parsed_date.tzinfo is None:
            # Timestamps without an offset are interpreted as UTC.
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        # truncate the timestamp
        return str(int(parsed_date.timestamp()))

    def _request(self, url, **kwargs):
        """Issue a GET request for ``url`` through the shared session."""
        return self.session.get(url, **kwargs)

    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Returns a spec dict with the keys ``dataset_id``, ``api_url`` and
        ``version`` when ``source`` is a ``.../dataset/<id>`` URL whose host
        answers the CKAN ``status_show`` action with HTTP 200, and ``None``
        otherwise. ``ref`` and ``extra_args`` are accepted but not used.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        # Only URLs that look like a dataset page are probed; anything else
        # (for example a plain Git URL) is rejected without a network call.
        match = re.search(self.url_regex, parsed_url.path)
        if match is None:
            return None
        dataset_id = parsed_url.path.rsplit("/", maxsplit=1)[1]
        if not dataset_id:
            return None

        # Keep whatever precedes ``/dataset/`` and drop params, query and
        # fragment so they cannot leak into the action URLs built below.
        api_url = parsed_url._replace(
            path=f"{parsed_url.path[: match.start()]}/api/3/action/",
            params="",
            query="",
            fragment="",
        ).geturl()

        resp = self.urlopen(f"{api_url}status_show")
        if resp.status_code != 200:
            return None

        self.dataset_id = dataset_id
        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        ``spec`` must provide ``dataset_id`` and ``api_url``. Every resource
        listed by ``package_show`` is downloaded into ``output_dir``, and
        progress messages are yielded along the way. ``yield_output`` is
        accepted but not used. Raises the HTTP error reported by a response
        when a request did not succeed.
        """
        dataset_id = spec["dataset_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"
        package_show_url = f"{spec['api_url']}package_show?id={dataset_id}"
        resp = self.urlopen(
            package_show_url,
            headers={"accept": "application/json"},
        )
        resp.raise_for_status()
        dataset = resp.json()

        yield "Fetching CKAN resources.\n"
        for resource in dataset["result"]["resources"]:
            file_url = resource["url"]
            # Name the file after the last segment of the URL *path* so a
            # query string or fragment never ends up in the file name.
            fname = path.basename(urlparse(file_url).path)
            if fname in ("", ".", ".."):
                # No usable file name in the URL: use the resource id.
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            # The context manager releases the streamed connection even when
            # the download fails part way through.
            with self._request(file_url, stream=True) as file_resp:
                file_resp.raise_for_status()

                dst_fname = path.join(output_dir, fname)
                with open(dst_fname, "wb") as dst:
                    yield f"Fetching {fname}\n"
                    for chunk in file_resp.iter_content(chunk_size=None):
                        dst.write(chunk)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content."""
        return f"{self.dataset_id}.v{self.version}"

Wyświetl plik

@ -0,0 +1,58 @@
import os
from contextlib import contextmanager
from tempfile import NamedTemporaryFile, TemporaryDirectory
import pytest
from repo2docker.contentproviders import CKAN
# Module-level provider instance (not referenced by the tests shown here).
test_ckan = CKAN()

# Each entry pairs a list of dataset URLs with the spec that ``CKAN.detect``
# is expected to return for them.
test_hosts = [
    (
        ["http://demo.ckan.org/dataset/sample-dataset-1"],
        dict(
            dataset_id="sample-dataset-1",
            api_url="http://demo.ckan.org/api/3/action/",
            version="1707387710",
        ),
    ),
]
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_ckan(test_input, expected):
    """Check which sources the CKAN provider accepts and which it rejects."""
    # Every URL listed for a case must resolve to the same spec, not just
    # the first one.
    for url in test_input:
        assert CKAN().detect(url) == expected

    # Don't trigger the CKAN content provider
    assert CKAN().detect("/some/path/here") is None
    assert CKAN().detect("https://example.com/path/here") is None
    assert CKAN().detect("https://data.gov.tw/dataset/6564") is None
@contextmanager
def ckan_file():
    """Yield the path of a temporary file that contains ``b"some content"``.

    The file is removed when the context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # Push the buffered bytes to disk: callers open the file by name, and
        # without the flush they could read an empty file.
        file.flush()
        yield file.name
def test_ckan_fetch(requests_mock):
    """Fetch a mocked one-resource dataset and check the downloaded file."""
    with ckan_file() as ckan_path:
        # Read the fixture through a context manager so the handle is closed.
        with open(ckan_path, "rb") as source:
            source_bytes = source.read()

        mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        requests_mock.get(f"file://{ckan_path}", content=source_bytes)

        with TemporaryDirectory() as d:
            ckan = CKAN()
            spec = {
                "dataset_id": "1234",
                "api_url": "http://demo.ckan.org/api/3/action/",
            }
            # ``fetch`` is a generator: it only downloads while it is consumed.
            output = list(ckan.fetch(spec, d))
            assert output

            fname = ckan_path.rsplit("/", maxsplit=1)[1]
            assert {fname} == set(os.listdir(d))

            # The downloaded file must hold exactly the bytes that were served.
            with open(os.path.join(d, fname), "rb") as fetched:
                assert fetched.read() == source_bytes