From 8512b7729c8f8d91f9952c14b7bd2f19ec211c61 Mon Sep 17 00:00:00 2001
From: Peter Hanecak <115141505+phanecak-maptiler@users.noreply.github.com>
Date: Sat, 10 Aug 2024 11:17:13 +0200
Subject: [PATCH] Update old wikidata items (#971)

---
 .../com/onthegomap/planetiler/Planetiler.java | 11 ++-
 .../planetiler/config/Arguments.java          |  5 +-
 .../onthegomap/planetiler/util/Wikidata.java  | 77 +++++++++++++++++--
 .../planetiler/util/WikidataTest.java         | 42 ++++++++++
 4 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java
index e0fb0946..d87fe774 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java
@@ -40,6 +40,7 @@ import java.io.IOException;
 import java.nio.file.FileSystem;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -111,6 +112,8 @@ public class Planetiler {
   private boolean useWikidata = false;
   private boolean onlyFetchWikidata = false;
   private boolean fetchWikidata = false;
+  private Duration wikidataMaxAge = Duration.ZERO;
+  private int wikidataUpdateLimit = 0;
   private final boolean fetchOsmTileStats;
   private TileArchiveMetadata tileArchiveMetadata;
@@ -573,6 +576,11 @@ public class Planetiler {
       fetchWikidata);
     useWikidata = fetchWikidata || arguments.getBoolean("use_wikidata", "use wikidata translations", true);
     wikidataNamesFile = arguments.file("wikidata_cache", "wikidata cache file", defaultWikidataCache);
+    wikidataMaxAge =
+      arguments.getDuration("wikidata_max_age",
+        "Maximum age of Wikidata translations (in ISO-8601 duration format PnDTnHnMn.nS; 0S = disabled)", "0s");
+    wikidataUpdateLimit = arguments.getInteger("wikidata_update_limit",
+      "Limit on how many old translations to update during one download (0 = disabled)", 0);
     return this;
   }
@@ -793,7 +801,8 @@ public class Planetiler {
     ensureInputFilesExist();
 
     if (fetchWikidata) {
-      Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats());
+      Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats(), wikidataMaxAge,
+        wikidataUpdateLimit);
     }
     if (useWikidata) {
       translations().addFallbackTranslationProvider(Wikidata.load(wikidataNamesFile));
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/config/Arguments.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/config/Arguments.java
index 69553344..d4e92d99 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/config/Arguments.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/config/Arguments.java
@@ -473,7 +473,10 @@ public class Arguments {
    */
   public Duration getDuration(String key, String description, String defaultValue) {
     String value = getArg(key, defaultValue);
-    Duration parsed = Duration.parse("PT" + value);
+    if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
+      value = "PT" + value;
+    }
+    Duration parsed = Duration.parse(value);
     logArgValue(key, description, parsed.get(ChronoUnit.SECONDS) + " seconds");
     return parsed;
   }
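Note: the Arguments.getDuration() change above keeps the old shorthand behavior (bare values get a "PT" prefix) while also accepting full ISO-8601 durations, which the new wikidata_max_age argument needs for values like "P30D". A minimal standalone sketch mirroring the new logic (class and method names are invented for illustration, not part of this patch):

    import java.time.Duration;

    // Mirrors the new normalization in Arguments.getDuration(): only prefix "PT"
    // when the value does not already look like a full ISO-8601 duration.
    public class DurationArgSketch {
      static Duration parseDurationArg(String value) {
        if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
          value = "PT" + value; // legacy shorthand such as "30s" or "90m"
        }
        return Duration.parse(value);
      }

      public static void main(String[] args) {
        System.out.println(parseDurationArg("30s"));   // PT30S (legacy shorthand still works)
        System.out.println(parseDurationArg("P30D"));  // PT720H (full ISO-8601 period now accepted)
        System.out.println(parseDurationArg("-PT5M")); // PT-5M (negative durations pass through)
      }
    }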
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Wikidata.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Wikidata.java
index 398713b5..18ae737f 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Wikidata.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Wikidata.java
@@ -37,6 +37,9 @@ import java.net.http.HttpResponse.BodyHandlers;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -111,11 +114,23 @@
    * @throws UncheckedIOException if an error occurs
    */
   public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats) {
+    fetch(infile, outfile, config, profile, stats, Duration.ofSeconds(0), 0);
+  }
+
+  /**
+   * Loads any existing translations from {@code outfile}, then downloads translations for any wikidata element in
+   * {@code infile} that is missing from the cache or older than {@code maxAge}, and writes the results to {@code outfile}.
+   *
+   * @throws UncheckedIOException if an error occurs
+   */
+  public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats,
+    Duration maxAge, int updateLimit) {
+
     var timer = stats.startStage("wikidata");
     int processThreads = Math.max(1, config.threads() - 1);
     LOGGER.info("Starting with " + processThreads + " process threads");
 
-    WikidataTranslations oldMappings = load(outfile);
+    WikidataTranslations oldMappings = load(outfile, maxAge, updateLimit);
     try (
       Writer writer = Files.newBufferedWriter(outfile);
       OsmBlockSource osmSource = infile.get()
@@ -163,13 +178,19 @@
    * Returns translations parsed from {@code path} that was written by a previous run of the downloader.
    */
   public static WikidataTranslations load(Path path) {
+    var translationsProvider = Wikidata.load(path, Duration.ZERO, 0);
+    translationsProvider.clearUpdateTimes();
+    return translationsProvider;
+  }
+
+  private static WikidataTranslations load(Path path, Duration maxAge, int updateLimit) {
     Timer timer = Timer.start();
     if (!Files.exists(path)) {
       LOGGER.info("no wikidata translations found, run with --fetch-wikidata to download");
       return new WikidataTranslations();
     } else {
       try (BufferedReader fis = Files.newBufferedReader(path)) {
-        WikidataTranslations result = load(fis);
+        WikidataTranslations result = load(fis, maxAge, updateLimit, Clock.systemUTC());
         LOGGER.info(
           "loaded from " + result.getAll().size() + " mappings from " + path.toAbsolutePath() + " in " + timer.stop());
         return result;
       }
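Note: with this change, each line of the wikidata cache file is a JSON array of the form ["<qid>",{"<lang>":"<name>",...},<updateTimeEpochMillis>]; the third element is absent in files written by older versions, and such entries are treated as infinitely old. A sketch restating the per-entry decision the loader applies below (the helper class and method names are invented for illustration):

    import java.time.Clock;
    import java.time.Duration;
    import java.time.Instant;

    // Restates the loader's staleness check; dropped entries are re-fetched later.
    class StaleEntrySketch {
      static boolean dropAsStale(Instant updateTime, Duration maxAge, int updateLimit, int droppedSoFar, Clock clock) {
        if (maxAge.isZero()) {
          return false; // wikidata_max_age=0s: feature disabled, keep everything
        }
        Instant cutoff = Instant.now(clock).minus(maxAge);
        // entries without a timestamp default to Instant.EPOCH, so they always look stale;
        // updateLimit > 0 caps how many entries get dropped (and re-fetched) per run
        return updateTime.isBefore(cutoff) && (updateLimit <= 0 || droppedSoFar < updateLimit);
      }
    }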
@@ -185,14 +206,37 @@
    * second element is a map from language to translation.
    */
   static WikidataTranslations load(BufferedReader reader) throws IOException {
+    return load(reader, Duration.ZERO, 0, Clock.systemUTC());
+  }
+
+  protected static WikidataTranslations load(BufferedReader reader, Duration maxAge, int updateLimit, Clock clock)
+    throws IOException {
     WikidataTranslations mappings = new WikidataTranslations();
     String line;
+    Instant updateTimeLimit = maxAge.isZero() ? null : Instant.now(clock).minus(maxAge);
+    int updateCounter = 0;
     while ((line = reader.readLine()) != null) {
       JsonNode node = objectMapper.readTree(line);
       long id = Long.parseLong(node.get(0).asText());
+
+      Instant updateTime = Instant.EPOCH;
+      if (node.has(2)) {
+        updateTime = Instant.ofEpochMilli(node.get(2).asLong());
+      }
+      if (updateTimeLimit != null && updateTime.isBefore(updateTimeLimit) &&
+        (updateLimit <= 0 || updateCounter < updateLimit)) {
+        // do not load old entries => new translations will be fetched later
+        updateCounter++;
+        continue;
+      }
+      mappings.putUpdateTime(id, updateTime);
+
       ObjectNode theseMappings = (ObjectNode) node.get(1);
       theseMappings.fields().forEachRemaining(entry -> mappings.put(id, entry.getKey(), entry.getValue().asText()));
     }
+    if (updateCounter > 0) {
+      LOGGER.info("{} translations dropped as too old, will be re-fetched", updateCounter);
+    }
     return mappings;
   }
@@ -255,7 +299,7 @@
         LongObjectMap<Map<String, String>> results = queryWikidata(qidsToFetch);
         batches.inc();
         LOGGER.info("Fetched batch {} ({} qids) {}", batches.get(), qidsToFetch.size(), timer.stop());
-        writeTranslations(results);
+        writeTranslations(results, Hppc.newLongObjectHashMap());
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
         throwFatalException(e);
       }
@@ -330,9 +374,10 @@
   void loadExisting(WikidataTranslations oldMappings) throws IOException {
     LongObjectMap<Map<String, String>> alreadyHave = oldMappings.getAll();
+    LongObjectMap<Instant> alreadyHaveUpdateTimes = oldMappings.getUpdateTimes();
     if (!alreadyHave.isEmpty()) {
       LOGGER.info("skipping " + alreadyHave.size() + " mappings we already have");
-      writeTranslations(alreadyHave);
+      writeTranslations(alreadyHave, alreadyHaveUpdateTimes);
       for (LongObjectCursor<Map<String, String>> cursor : alreadyHave) {
         visited.add(cursor.key);
       }
     }
   }
 
   /** Flushes a batch of translations to disk. */
-  private void writeTranslations(LongObjectMap<Map<String, String>> results) throws IOException {
+  private void writeTranslations(LongObjectMap<Map<String, String>> results, LongObjectMap<Instant> updateTimes)
+    throws IOException {
+    final long updateTimeDefault = Instant.now().toEpochMilli();
     for (LongObjectCursor<Map<String, String>> cursor : results) {
+      long updateTime =
+        updateTimes.containsKey(cursor.key) ? updateTimes.get(cursor.key).toEpochMilli() : updateTimeDefault;
       writer.write(objectMapper.writeValueAsString(List.of(
         Long.toString(cursor.key),
-        cursor.value
+        cursor.value,
+        updateTime
       )));
       writer.write(System.lineSeparator());
     }
@@ -383,6 +433,7 @@ public static class WikidataTranslations implements Translations.TranslationProvider {
 
     private final LongObjectMap<Map<String, String>> data = Hppc.newLongObjectHashMap();
+    private final LongObjectMap<Instant> updateTimes = Hppc.newLongObjectHashMap();
 
     public WikidataTranslations() {}
@@ -391,11 +442,25 @@
       return data.get(qid);
     }
 
+    public void clearUpdateTimes() {
+      updateTimes.clear();
+    }
+
     /** Returns all maps from language code to translated name for {@code qid}. */
     public LongObjectMap<Map<String, String>> getAll() {
       return data;
     }
 
+    /** Returns a map from {@code qid} to the time its translations were last downloaded. */
+    public LongObjectMap<Instant> getUpdateTimes() {
+      return updateTimes;
+    }
+
+    /** Stores an update date+time for {@code qid}. */
+    public void putUpdateTime(long qid, Instant updateTime) {
+      updateTimes.put(qid, updateTime);
+    }
+
     /** Stores a name translation for {@code qid} in {@code lang}. */
     public void put(long qid, String lang, String value) {
       Map<String, String> map = data.get(qid);
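Note: for callers, the new overload slots in where the old five-argument fetch() was used. A hypothetical invocation (variable names and values chosen for illustration):

    // Re-fetch cached translations older than 30 days, at most 5000 per run;
    // Duration.ZERO and 0 preserve the old never-refresh behavior.
    Wikidata.fetch(osmInputFile, Path.of("wikidata-cache.json"), config, profile, stats,
      Duration.ofDays(30), 5000);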
diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/WikidataTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/WikidataTest.java
index f4ca92bd..bec46b96 100644
--- a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/WikidataTest.java
+++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/WikidataTest.java
@@ -19,6 +19,10 @@ import java.net.http.HttpResponse.BodySubscriber;
 import java.net.http.HttpResponse.BodySubscribers;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
+import java.time.ZoneId;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -75,6 +79,17 @@ class WikidataTest {
       }
     }
     """;
+  final String wikidataNamesLegacyJson = """
+    ["1",{"en":"English 1","de":"Deutsch 1"}]
+    ["2",{"en":"English 2","de":"Deutsch 2"}]
+    ["3",{"en":"English 3","de":"Deutsch 3"}]
+    """;
+  final Clock clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneId.of("UTC"));
+  final String wikidataNamesJson = """
+    ["1",{"en":"English 1","de":"Deutsch 1"},55000]
+    ["2",{"en":"English 2","de":"Deutsch 2"},30000]
+    ["3",{"en":"English 3","de":"Deutsch 3"},30000]
+    """;
 
   @Test
   void testWikidataTranslations() {
@@ -161,6 +176,33 @@ class WikidataTest {
     assertInstanceOf(IOException.class, innerException);
   }
 
+  @Test
+  void testLegacyWikidataNamesJson() throws IOException {
+    var reader = new BufferedReader(new StringReader(wikidataNamesLegacyJson));
+    // no timestamps in the file + an age limit set => every entry counts as old => all should be dropped
+    var translationsProvider = Wikidata.load(reader, Duration.ofSeconds(1), 0, clock);
+    assertEquals(0, translationsProvider.getAll().size());
+  }
+
+  @Test
+  void testWikidataNamesJsonMaxAge() throws IOException {
+    // maxAge 10s with the clock fixed at t=60s: item 1 is 5s old hence fresh, the other two are 30s old hence stale
+    Duration maxAge = Duration.ofSeconds(10);
+
+    var reader = new BufferedReader(new StringReader(wikidataNamesJson));
+    var translationsProvider = Wikidata.load(reader, maxAge, 0, clock);
+    assertEquals(1, translationsProvider.getAll().size());
+  }
+
+  @Test
+  void testWikidataNamesJsonUpdateLimit() throws IOException {
+    Duration maxAge = Duration.ofSeconds(1);
+
+    var reader = new BufferedReader(new StringReader(wikidataNamesJson));
+    var translationsProvider = Wikidata.load(reader, maxAge, 1, clock);
+    assertEquals(2, translationsProvider.getAll().size());
+  }
+
   private static void assertEqualsIgnoringWhitespace(String expected, String actual) {
     assertEquals(ignoreWhitespace(expected), ignoreWhitespace(actual));
   }
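Note: in CLI terms, assuming Planetiler's usual --key=value argument syntax, a refresh of stale translations would be requested with something like:

    --fetch-wikidata --wikidata_max_age=P30D --wikidata_update_limit=5000

Shorthand values such as "30s" or "90m" keep parsing exactly as before thanks to the relaxed getDuration() handling, so existing configurations are unaffected.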