Sol Lee 2024-04-19 01:05:16 +00:00 zatwierdzone przez GitHub
commit f1248f6c14
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
5 zmienionych plików z 211 dodań i 1 usunięć

Wyświetl plik

@ -39,7 +39,8 @@ where ``<source-repository>`` is:
* a URL of a Git repository (``https://github.com/binder-examples/requirements``), * a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``), * a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``),
* a URL of a CKAN_ dataset (``https://demo.ckan.org/dataset/sample-dataset-1``), or
* a path to a local directory (``a/local/directory``) * a path to a local directory (``a/local/directory``)
of the source repository you want to build. of the source repository you want to build.
@ -136,3 +137,4 @@ Command line API
.. _Pytudes: https://github.com/norvig/pytudes .. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html .. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
.. _CKAN: https://ckan.org

Wyświetl plik

@ -152,6 +152,7 @@ class Repo2Docker(Application):
contentproviders.Dataverse, contentproviders.Dataverse,
contentproviders.Hydroshare, contentproviders.Hydroshare,
contentproviders.Swhid, contentproviders.Swhid,
contentproviders.CKAN,
contentproviders.Mercurial, contentproviders.Mercurial,
contentproviders.Git, contentproviders.Git,
], ],

Wyświetl plik

@ -1,4 +1,5 @@
from .base import Local from .base import Local
from .ckan import CKAN
from .dataverse import Dataverse from .dataverse import Dataverse
from .figshare import Figshare from .figshare import Figshare
from .git import Git from .git import Git

Wyświetl plik

@ -0,0 +1,127 @@
from datetime import datetime, timedelta, timezone
from os import path
from urllib.parse import parse_qs, urlparse
from requests import Session
from .. import __version__
from .base import ContentProvider
class CKAN(ContentProvider):
    """Provide contents of a remote CKAN dataset.

    A dataset is recognised from a URL whose path ends in
    ``/dataset/<id>``, optionally followed by ``/history/<activity_id>``
    or carrying an ``activity_id`` query parameter. All metadata is read
    through the site's ``/api/3/action/`` endpoints.
    """

    def __init__(self):
        super().__init__()
        # One shared session so every request carries the same user agent.
        self.session = Session()
        self.session.headers.update(
            {
                "user-agent": f"repo2docker {__version__}",
            }
        )

    def _fetch_version(self, api_url):
        """Fetch dataset modified date and convert to epoch.

        Borrowed from the Hydroshare provider.

        Reads ``result.metadata_modified`` from the ``package_show``
        response for ``self.dataset_id`` and returns the timestamp,
        interpreted as UTC, as a string of whole seconds since the epoch.
        """
        package_show_url = f"{api_url}package_show?id={self.dataset_id}"
        resp = self.urlopen(package_show_url).json()
        date = resp["result"]["metadata_modified"]
        try:
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except ValueError:
            # Accept timestamps serialized without a fractional part, which
            # is what Python's ``isoformat()`` emits when microseconds are 0.
            parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def _request(self, url, **kwargs):
        """Issue a GET request for *url* through the shared session."""
        return self.session.get(url, **kwargs)

    # Alias kept so metadata calls read like the other content providers.
    urlopen = _request

    def detect(self, source, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a CKAN dataset.

        Returns a spec dict with the keys ``dataset_id``, ``activity_id``,
        ``api_url`` and ``version`` when *source* looks like a dataset URL
        and the site's ``status_show`` endpoint answers with HTTP 200;
        returns ``None`` otherwise. ``ref`` and ``extra_args`` are unused.
        """
        parsed_url = urlparse(source)
        if not parsed_url.netloc:
            return None

        # Everything after "/history/" (if present) is an activity id.
        url_parts_1 = parsed_url.path.split("/history/")
        url_parts_2 = url_parts_1[0].split("/")
        # An empty path splits to [""], so guard the length before indexing
        # from the end; otherwise a bare "https://host" raises IndexError.
        if len(url_parts_2) < 2 or url_parts_2[-2] != "dataset":
            return None
        if not url_parts_2[-1]:
            # ".../dataset/" names no dataset, so there is nothing to fetch.
            return None
        self.dataset_id = url_parts_2[-1]

        # Keep any path prefix in front of "/dataset/<id>" so the API URL
        # is built relative to the same mount point as the dataset page.
        api_url_path = "/api/3/action/"
        api_url = parsed_url._replace(
            path="/".join(url_parts_2[:-2]) + api_url_path, query=""
        ).geturl()

        status_show_url = f"{api_url}status_show"
        resp = self.urlopen(status_show_url)
        if resp.status_code != 200:
            return None

        # handle the activities: a "/history/<id>" path segment takes
        # precedence over an "activity_id" query parameter.
        activity_id = None
        query_activity_ids = parse_qs(parsed_url.query).get("activity_id")
        if query_activity_ids is not None:
            activity_id = query_activity_ids[0]
        if len(url_parts_1) == 2:
            activity_id = url_parts_1[-1]

        self.version = self._fetch_version(api_url)
        return {
            "dataset_id": self.dataset_id,
            "activity_id": activity_id,
            "api_url": api_url,
            "version": self.version,
        }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch a CKAN dataset.

        Downloads every resource of the dataset described by *spec* (as
        returned by ``detect``) into *output_dir*, yielding progress
        messages along the way. ``yield_output`` is accepted but unused.
        """
        dataset_id = spec["dataset_id"]
        activity_id = spec["activity_id"]

        yield f"Fetching CKAN dataset {dataset_id}.\n"

        # handle the activities: a specific activity is read through
        # activity_data_show, the current state through package_show.
        if activity_id:
            fetch_url = (
                f"{spec['api_url']}activity_data_show?"
                f"id={activity_id}&object_type=package"
            )
        else:
            fetch_url = f"{spec['api_url']}package_show?id={dataset_id}"

        resp = self.urlopen(
            fetch_url,
            headers={"accept": "application/json"},
        )
        dataset = resp.json()

        yield "Fetching CKAN resources.\n"

        resources = dataset["result"]["resources"]

        for resource in resources:
            file_url = resource["url"]
            if file_url == "":
                # Resources without a URL have nothing to download.
                continue
            # Name the file after the last URL segment, falling back to the
            # resource id when the URL ends with a slash.
            fname = file_url.rsplit("/", maxsplit=1)[-1]
            if fname == "":
                fname = resource["id"]

            yield f"Requesting {file_url}\n"
            resp = self._request(file_url, stream=True)
            resp.raise_for_status()

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield f"Fetching {fname}\n"
                for chunk in resp.iter_content(chunk_size=None):
                    dst.write(chunk)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content."""
        return f"{self.dataset_id}.v{self.version}"

Wyświetl plik

@ -0,0 +1,79 @@
import os
from contextlib import contextmanager
from tempfile import NamedTemporaryFile, TemporaryDirectory
from repo2docker.contentproviders import CKAN
def test_detect_ckan(requests_mock):
    """Check detect() on the plain, query-string and history URL forms."""
    requests_mock.get("http://demo.ckan.org/api/3/action/status_show", status_code=200)
    requests_mock.get(
        "http://demo.ckan.org/api/3/action/package_show?id=1234",
        json={"result": {"metadata_modified": "2024-02-27T14:15:54.573058"}},
    )

    without_activity = {
        "dataset_id": "1234",
        "activity_id": None,
        "api_url": "http://demo.ckan.org/api/3/action/",
        "version": "1709043354",
    }
    with_activity = dict(without_activity, activity_id="5678")

    # Plain dataset URL: no activity is selected.
    assert CKAN().detect("http://demo.ckan.org/dataset/1234") == without_activity

    # Both ways of addressing an activity must yield the same spec.
    activity_sources = (
        "http://demo.ckan.org/dataset/1234?activity_id=5678",
        "http://demo.ckan.org/dataset/1234/history/5678",
    )
    for activity_source in activity_sources:
        assert CKAN().detect(activity_source) == with_activity
def test_detect_not_ckan(requests_mock):
    """Check that sources which are not CKAN datasets are rejected.

    The ``requests_mock`` fixture keeps the test offline: the last URL has
    the ``/dataset/<id>`` shape, so detect() probes that host's
    ``status_show`` endpoint, which is answered here with a non-200 status.
    """
    requests_mock.get("https://data.gov.tw/api/3/action/status_show", status_code=404)

    # Don't trigger the CKAN content provider
    assert CKAN().detect("/some/path/here") is None
    assert CKAN().detect("https://example.com/path/here") is None
    assert CKAN().detect("https://data.gov.tw/dataset/6564") is None
@contextmanager
def ckan_file():
    """Yield the path of a temporary file containing ``b"some content"``.

    The file is removed when the context exits.
    """
    with NamedTemporaryFile() as file:
        file.write(b"some content")
        # Flush the write buffer so code that reopens the file by name
        # actually sees the payload instead of an empty file.
        file.flush()
        yield file.name
def test_ckan_fetch(requests_mock):
    """Check fetch() downloads the dataset's resource with and without an activity."""
    with ckan_file() as ckan_path:
        mock_response = {"result": {"resources": [{"url": f"file://{ckan_path}"}]}}
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
        )
        requests_mock.get(
            "http://demo.ckan.org/api/3/action/activity_data_show?id=5678",
            json=mock_response,
        )
        # Read the payload through a context manager so the handle is closed.
        with open(ckan_path, "rb") as resource_file:
            resource_content = resource_file.read()
        requests_mock.get(f"file://{ckan_path}", content=resource_content)

        ckan = CKAN()
        base_spec = {
            "dataset_id": "1234",
            "api_url": "http://demo.ckan.org/api/3/action/",
        }
        # fetch() names the download after the last segment of the URL.
        expected = {ckan_path.rsplit("/", maxsplit=1)[-1]}

        # First the current dataset state, then a specific activity.
        for activity_id in (None, "5678"):
            spec = dict(base_spec, activity_id=activity_id)
            with TemporaryDirectory() as d:
                # fetch() is a generator; it must be drained to do the work.
                for _ in ckan.fetch(spec, d):
                    pass
                assert expected == set(os.listdir(d))