Porównaj commity

...

2 Commity

Autor SHA1 Wiadomość Data
Sol Lee c948d78013 Skip the resource with an empty link 2024-04-19 01:00:33 +00:00
Sol Lee 7faa53c83b Handle the activities 2024-04-18 09:12:54 +00:00
2 zmienione pliki z 59 dodaniami i 13 usunięciami

Wyświetl plik

@ -1,6 +1,6 @@
from datetime import datetime, timedelta, timezone
from os import path
from urllib.parse import urlparse
from urllib.parse import parse_qs, urlparse
from requests import Session
@ -43,23 +43,33 @@ class CKAN(ContentProvider):
if not parsed_url.netloc:
return None
url_parts = parsed_url.path.split("/")
if url_parts[-2] == "dataset":
self.dataset_id = url_parts[-1]
url_parts_1 = parsed_url.path.split("/history/")
url_parts_2 = url_parts_1[0].split("/")
if url_parts_2[-2] == "dataset":
self.dataset_id = url_parts_2[-1]
else:
return None
api_url_path = "/api/3/action/"
api_url = parsed_url._replace(
path="/".join(url_parts[:-2]) + api_url_path
path="/".join(url_parts_2[:-2]) + api_url_path, query=""
).geturl()
status_show_url = f"{api_url}status_show"
resp = self.urlopen(status_show_url)
if resp.status_code == 200:
# handle the activities
activity_id = None
if parse_qs(parsed_url.query).get("activity_id") is not None:
activity_id = parse_qs(parsed_url.query).get("activity_id")[0]
if len(url_parts_1) == 2:
activity_id = url_parts_1[-1]
self.version = self._fetch_version(api_url)
return {
"dataset_id": self.dataset_id,
"activity_id": activity_id,
"api_url": api_url,
"version": self.version,
}
@ -69,11 +79,21 @@ class CKAN(ContentProvider):
def fetch(self, spec, output_dir, yield_output=False):
"""Fetch a CKAN dataset."""
dataset_id = spec["dataset_id"]
activity_id = spec["activity_id"]
yield f"Fetching CKAN dataset {dataset_id}.\n"
package_show_url = f"{spec['api_url']}package_show?id={dataset_id}"
# handle the activities
if activity_id:
fetch_url = (
f"{spec['api_url']}activity_data_show?"
f"id={activity_id}&object_type=package"
)
else:
fetch_url = f"{spec['api_url']}package_show?id={dataset_id}"
resp = self.urlopen(
package_show_url,
fetch_url,
headers={"accept": "application/json"},
)
@ -85,6 +105,8 @@ class CKAN(ContentProvider):
for resource in resources:
file_url = resource["url"]
if file_url == "":
continue
fname = file_url.rsplit("/", maxsplit=1)[-1]
if fname == "":
fname = resource["id"]

Wyświetl plik

@ -14,11 +14,23 @@ def test_detect_ckan(requests_mock):
expected = {
"dataset_id": "1234",
"activity_id": None,
"api_url": "http://demo.ckan.org/api/3/action/",
"version": "1709043354",
}
expected_activity = expected.copy()
expected_activity["activity_id"] = "5678"
assert CKAN().detect("http://demo.ckan.org/dataset/1234") == expected
assert (
CKAN().detect("http://demo.ckan.org/dataset/1234?activity_id=5678")
== expected_activity
)
assert (
CKAN().detect("http://demo.ckan.org/dataset/1234/history/5678")
== expected_activity
)
def test_detect_not_ckan():
@ -41,15 +53,27 @@ def test_ckan_fetch(requests_mock):
requests_mock.get(
"http://demo.ckan.org/api/3/action/package_show?id=1234", json=mock_response
)
requests_mock.get(
"http://demo.ckan.org/api/3/action/activity_data_show?id=5678",
json=mock_response,
)
requests_mock.get(f"file://{ckan_path}", content=open(ckan_path, "rb").read())
ckan = CKAN()
spec = {"dataset_id": "1234", "api_url": "http://demo.ckan.org/api/3/action/"}
expected = {ckan_path.rsplit("/", maxsplit=1)[1]}
with TemporaryDirectory() as d:
ckan = CKAN()
spec = {
"dataset_id": "1234",
"api_url": "http://demo.ckan.org/api/3/action/",
}
spec["activity_id"] = None
output = []
for l in ckan.fetch(spec, d):
output.append(l)
assert expected == set(os.listdir(d))
with TemporaryDirectory() as d:
spec["activity_id"] = "5678"
output = []
for l in ckan.fetch(spec, d):
output.append(l)
expected = {ckan_path.rsplit("/", maxsplit=1)[1]}
assert expected == set(os.listdir(d))