Mirror of https://github.com/onthegomap/planetiler
Update old wikidata items (#971)
parent 240edee34e
commit 8512b7729c
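Summary (derived from the diff below): Planetiler can now refresh stale Wikidata name translations instead of trusting its cache forever. Two new arguments are threaded through Planetiler: wikidata_max_age, an ISO-8601 duration after which a cached translation counts as stale (0 disables expiry), and wikidata_update_limit, a cap on how many stale entries are refreshed per download (0 disables the cap). Each line of the wikidata cache file gains a third element holding the last-update time in epoch milliseconds; legacy two-element lines are treated as infinitely old. Both formats appear in the test fixtures below:

    legacy: ["1",{"en":"English 1","de":"Deutch 1"}]
    new:    ["1",{"en":"English 1","de":"Deutch 1"},55000]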
Planetiler.java
@@ -40,6 +40,7 @@ import java.io.IOException;
 import java.nio.file.FileSystem;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -111,6 +112,8 @@ public class Planetiler {
   private boolean useWikidata = false;
   private boolean onlyFetchWikidata = false;
   private boolean fetchWikidata = false;
+  private Duration wikidataMaxAge = Duration.ZERO;
+  private int wikidataUpdateLimit = 0;
   private final boolean fetchOsmTileStats;
   private TileArchiveMetadata tileArchiveMetadata;

@@ -573,6 +576,11 @@ public class Planetiler {
         fetchWikidata);
     useWikidata = fetchWikidata || arguments.getBoolean("use_wikidata", "use wikidata translations", true);
     wikidataNamesFile = arguments.file("wikidata_cache", "wikidata cache file", defaultWikidataCache);
+    wikidataMaxAge =
+      arguments.getDuration("wikidata_max_age",
+        "Maximum age of Wikidata translations (in ISO-8601 duration format PnDTnHnMn.nS; 0S = disabled)", "0s");
+    wikidataUpdateLimit = arguments.getInteger("wikidata_update_limit",
+      "Limit on how many old translations to update during one download (0 = disabled)", 0);
     return this;
   }

@@ -793,7 +801,8 @@ public class Planetiler {
     ensureInputFilesExist();

     if (fetchWikidata) {
-      Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats());
+      Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats(), wikidataMaxAge,
+        wikidataUpdateLimit);
     }
     if (useWikidata) {
       translations().addFallbackTranslationProvider(Wikidata.load(wikidataNamesFile));
Arguments.java
@@ -473,7 +473,10 @@ public class Arguments {
    */
   public Duration getDuration(String key, String description, String defaultValue) {
     String value = getArg(key, defaultValue);
-    Duration parsed = Duration.parse("PT" + value);
+    if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
+      value = "PT" + value;
+    }
+    Duration parsed = Duration.parse(value);
     logArgValue(key, description, parsed.get(ChronoUnit.SECONDS) + " seconds");
     return parsed;
   }
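The getDuration change above makes bare values like "30s" valid by prepending "PT" only when the input does not already look like an ISO-8601 duration. A minimal standalone sketch of that normalization (DurationArgDemo is a hypothetical name; the logic is copied from the hunk):

import java.time.Duration;

public class DurationArgDemo {
  static Duration parseLenient(String value) {
    // Bare values like "30s" become "PT30s"; full ISO-8601 values such as
    // "P2DT3H" or negative durations like "-PT5M" pass through untouched.
    if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
      value = "PT" + value;
    }
    return Duration.parse(value); // Duration.parse accepts lower-case unit letters
  }

  public static void main(String[] args) {
    System.out.println(parseLenient("30s"));    // PT30S
    System.out.println(parseLenient("P2DT3H")); // PT51H
    System.out.println(parseLenient("-PT5M"));  // PT-5M
  }
}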
Wikidata.java
@@ -37,6 +37,9 @@ import java.net.http.HttpResponse.BodyHandlers;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -111,11 +114,23 @@ public class Wikidata {
    * @throws UncheckedIOException if an error occurs
    */
   public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats) {
+    fetch(infile, outfile, config, profile, stats, Duration.ofSeconds(0), 0);
+  }
+
+  /**
+   * Loads any existing translations from {@code outfile}, then downloads translations for any wikidata elements in
+   * {@code infile} that have not already been downloaded and writes the results to {@code outfile}.
+   *
+   * @throws UncheckedIOException if an error occurs
+   */
+  public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats,
+    Duration maxAge, int updateLimit) {
+
     var timer = stats.startStage("wikidata");
     int processThreads = Math.max(1, config.threads() - 1);
     LOGGER.info("Starting with " + processThreads + " process threads");

-    WikidataTranslations oldMappings = load(outfile);
+    WikidataTranslations oldMappings = load(outfile, maxAge, updateLimit);
     try (
       Writer writer = Files.newBufferedWriter(outfile);
       OsmBlockSource osmSource = infile.get()
@@ -163,13 +178,19 @@ public class Wikidata {
    * Returns translations parsed from {@code path} that was written by a previous run of the downloader.
    */
   public static WikidataTranslations load(Path path) {
+    var translationsProvider = Wikidata.load(path, Duration.ZERO, 0);
+    translationsProvider.clearUpdateTimes();
+    return translationsProvider;
+  }
+
+  private static WikidataTranslations load(Path path, Duration maxAge, int updateLimit) {
     Timer timer = Timer.start();
     if (!Files.exists(path)) {
       LOGGER.info("no wikidata translations found, run with --fetch-wikidata to download");
       return new WikidataTranslations();
     } else {
       try (BufferedReader fis = Files.newBufferedReader(path)) {
-        WikidataTranslations result = load(fis);
+        WikidataTranslations result = load(fis, maxAge, updateLimit, Clock.systemUTC());
         LOGGER.info(
           "loaded from " + result.getAll().size() + " mappings from " + path.toAbsolutePath() + " in " + timer.stop());
         return result;
@@ -185,14 +206,37 @@ public class Wikidata {
    * second element is a map from language to translation.
    */
   static WikidataTranslations load(BufferedReader reader) throws IOException {
+    return load(reader, Duration.ZERO, 0, Clock.systemUTC());
+  }
+
+  protected static WikidataTranslations load(BufferedReader reader, Duration maxAge, int updateLimit, Clock clock)
+    throws IOException {
     WikidataTranslations mappings = new WikidataTranslations();
     String line;
+    Instant updateTimeLimit = maxAge.isZero() ? null : Instant.now(clock).minus(maxAge);
+    int updateCounter = 0;
     while ((line = reader.readLine()) != null) {
       JsonNode node = objectMapper.readTree(line);
       long id = Long.parseLong(node.get(0).asText());
+
+      Instant updateTime = Instant.EPOCH;
+      if (node.has(2)) {
+        updateTime = Instant.ofEpochMilli(node.get(2).asLong());
+      }
+      if (updateTimeLimit != null && updateTime.isBefore(updateTimeLimit) &&
+        (updateLimit <= 0 || updateCounter < updateLimit)) {
+        // do not load old entries => new translations will be fetched later
+        updateCounter++;
+        continue;
+      }
+      mappings.putUpdateTime(id, updateTime);
+
       ObjectNode theseMappings = (ObjectNode) node.get(1);
       theseMappings.fields().forEachRemaining(entry -> mappings.put(id, entry.getKey(), entry.getValue().asText()));
     }
+    if (updateCounter > 0) {
+      LOGGER.info("{} translations dropped as too old, will be re-fetched", updateCounter);
+    }
     return mappings;
   }

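The loader above skips an entry when its stored timestamp falls before "now minus maxAge", as long as the update limit has not been exhausted. A self-contained sketch of that cutoff arithmetic, using the same fixed clock as the tests further down (StalenessDemo is a hypothetical class):

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneOffset;

public class StalenessDemo {
  public static void main(String[] args) {
    // Fixed clock at 60s after the epoch, like the test fixture below.
    Clock clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneOffset.UTC);
    Duration maxAge = Duration.ofSeconds(10);
    Instant updateTimeLimit = Instant.now(clock).minus(maxAge); // 50s after epoch
    Instant fresh = Instant.ofEpochMilli(55_000); // element [2] of a cache line
    Instant stale = Instant.ofEpochMilli(30_000);
    System.out.println(fresh.isBefore(updateTimeLimit)); // false -> keep
    System.out.println(stale.isBefore(updateTimeLimit)); // true  -> drop, re-fetch later
  }
}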
@@ -255,7 +299,7 @@ public class Wikidata {
         LongObjectMap<Map<String, String>> results = queryWikidata(qidsToFetch);
         batches.inc();
         LOGGER.info("Fetched batch {} ({} qids) {}", batches.get(), qidsToFetch.size(), timer.stop());
-        writeTranslations(results);
+        writeTranslations(results, Hppc.newLongObjectHashMap());
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
         throwFatalException(e);
@@ -330,9 +374,10 @@ public class Wikidata {

   void loadExisting(WikidataTranslations oldMappings) throws IOException {
     LongObjectMap<Map<String, String>> alreadyHave = oldMappings.getAll();
+    LongObjectMap<Instant> alreadyHaveUpdateTimes = oldMappings.getUpdateTimes();
     if (!alreadyHave.isEmpty()) {
       LOGGER.info("skipping " + alreadyHave.size() + " mappings we already have");
-      writeTranslations(alreadyHave);
+      writeTranslations(alreadyHave, alreadyHaveUpdateTimes);
       for (LongObjectCursor<Map<String, String>> cursor : alreadyHave) {
         visited.add(cursor.key);
       }
@@ -340,11 +385,16 @@ public class Wikidata {
     }

   /** Flushes a batch of translations to disk. */
-  private void writeTranslations(LongObjectMap<Map<String, String>> results) throws IOException {
+  private void writeTranslations(LongObjectMap<Map<String, String>> results, LongObjectMap<Instant> updateTimes)
+    throws IOException {
+    final long updateTimeDefault = Instant.now().toEpochMilli();
     for (LongObjectCursor<Map<String, String>> cursor : results) {
+      long updateTime =
+        updateTimes.containsKey(cursor.key) ? updateTimes.get(cursor.key).toEpochMilli() : updateTimeDefault;
       writer.write(objectMapper.writeValueAsString(List.of(
         Long.toString(cursor.key),
-        cursor.value
+        cursor.value,
+        updateTime
       )));
       writer.write(System.lineSeparator());
     }
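After this change each flushed line is a three-element JSON array: qid, translations map, then the epoch-millis update stamp. A minimal sketch of that serialization (CacheLineDemo is a hypothetical class; Jackson is already used by the surrounding code):

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;
import java.util.Map;

public class CacheLineDemo {
  public static void main(String[] args) throws Exception {
    ObjectMapper objectMapper = new ObjectMapper();
    // One cache line: id string, language->name map, last-update epoch millis.
    String line = objectMapper.writeValueAsString(List.of(
      Long.toString(42L),
      Map.of("en", "Name"),
      1_700_000_000_000L
    ));
    System.out.println(line); // ["42",{"en":"Name"},1700000000000]
  }
}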
@@ -383,6 +433,7 @@ public class Wikidata {
   public static class WikidataTranslations implements Translations.TranslationProvider {

     private final LongObjectMap<Map<String, String>> data = Hppc.newLongObjectHashMap();
+    private final LongObjectMap<Instant> updateTimes = Hppc.newLongObjectHashMap();

     public WikidataTranslations() {}

@@ -391,11 +442,25 @@ public class Wikidata {
       return data.get(qid);
     }

+    public void clearUpdateTimes() {
+      updateTimes.clear();
+    }
+
     /** Returns all maps from language code to translated name for {@code qid}. */
     public LongObjectMap<Map<String, String>> getAll() {
       return data;
     }

+    /** Returns the last-update time for each qid. */
+    public LongObjectMap<Instant> getUpdateTimes() {
+      return updateTimes;
+    }
+
+    /** Stores an update date+time for {@code qid}. */
+    public void putUpdateTime(long qid, Instant updateTime) {
+      updateTimes.put(qid, updateTime);
+    }
+
     /** Stores a name translation for {@code qid} in {@code lang}. */
     public void put(long qid, String lang, String value) {
       Map<String, String> map = data.get(qid);
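WikidataTranslations now keeps a parallel map of update times alongside the translations. A plain-HashMap analog of that bookkeeping (UpdateTimesDemo is a hypothetical class; the real code uses HPPC's LongObjectMap):

import java.time.Instant;
import java.util.HashMap;
import java.util.Map;

public class UpdateTimesDemo {
  private final Map<Long, Instant> updateTimes = new HashMap<>();

  void putUpdateTime(long qid, Instant updateTime) {
    updateTimes.put(qid, updateTime);
  }

  void clearUpdateTimes() {
    // load(Path) above calls this so translation consumers never see timestamps.
    updateTimes.clear();
  }

  public static void main(String[] args) {
    var demo = new UpdateTimesDemo();
    demo.putUpdateTime(1L, Instant.ofEpochMilli(55_000));
    demo.clearUpdateTimes();
    System.out.println(demo.updateTimes.isEmpty()); // true
  }
}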
WikidataTest.java
@@ -19,6 +19,10 @@ import java.net.http.HttpResponse.BodySubscriber;
 import java.net.http.HttpResponse.BodySubscribers;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
+import java.time.Clock;
+import java.time.Duration;
+import java.time.Instant;
+import java.time.ZoneId;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -75,6 +79,17 @@ class WikidataTest {
       }
     }
     """;
+  final String wikidataNamesLegacyJson = """
+    ["1",{"en":"English 1","de":"Deutch 1"}]
+    ["2",{"en":"English 2","de":"Deutch 2"}]
+    ["3",{"en":"English 3","de":"Deutch 3"}]
+    """;
+  final Clock clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneId.of("UTC"));
+  final String wikidataNamesJson = """
+    ["1",{"en":"English 1","de":"Deutch 1"},55000]
+    ["2",{"en":"English 2","de":"Deutch 2"},30000]
+    ["3",{"en":"English 3","de":"Deutch 3"},30000]
+    """;

   @Test
   void testWikidataTranslations() {
@@ -161,6 +176,33 @@ class WikidataTest {
     assertInstanceOf(IOException.class, innerException);
   }

+  @Test
+  void testLegacyWikidataNamesJson() throws IOException {
+    var reader = new BufferedReader(new StringReader(wikidataNamesLegacyJson));
+    // no timestamp + age limit set => all old => all should be dropped
+    var translationsProvider = Wikidata.load(reader, Duration.ofSeconds(1), 0, clock);
+    assertEquals(0, translationsProvider.getAll().size());
+  }
+
+  @Test
+  void testWikidataNamesJsonMaxAge() throws IOException {
+    // 10s => item 1 is 5s old hence fresh, the rest is 30s old hence outdated
+    Duration maxAge = Duration.ofSeconds(10);
+
+    var reader = new BufferedReader(new StringReader(wikidataNamesJson));
+    var translationsProvider = Wikidata.load(reader, maxAge, 0, clock);
+    assertEquals(1, translationsProvider.getAll().size());
+  }
+
+  @Test
+  void testWikidataNamesJsonUpdateLimit() throws IOException {
+    Duration maxAge = Duration.ofSeconds(1);
+
+    var reader = new BufferedReader(new StringReader(wikidataNamesJson));
+    var translationsProvider = Wikidata.load(reader, maxAge, 1, clock);
+    assertEquals(2, translationsProvider.getAll().size());
+  }
+
   private static void assertEqualsIgnoringWhitespace(String expected, String actual) {
     assertEquals(ignoreWhitespace(expected), ignoreWhitespace(actual));
   }