kopia lustrzana https://github.com/onthegomap/planetiler
Update old wikidata items (#971)
rodzic
240edee34e
commit
8512b7729c
|
@ -40,6 +40,7 @@ import java.io.IOException;
|
|||
import java.nio.file.FileSystem;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -111,6 +112,8 @@ public class Planetiler {
|
|||
private boolean useWikidata = false;
|
||||
private boolean onlyFetchWikidata = false;
|
||||
private boolean fetchWikidata = false;
|
||||
private Duration wikidataMaxAge = Duration.ZERO;
|
||||
private int wikidataUpdateLimit = 0;
|
||||
private final boolean fetchOsmTileStats;
|
||||
private TileArchiveMetadata tileArchiveMetadata;
|
||||
|
||||
|
@ -573,6 +576,11 @@ public class Planetiler {
|
|||
fetchWikidata);
|
||||
useWikidata = fetchWikidata || arguments.getBoolean("use_wikidata", "use wikidata translations", true);
|
||||
wikidataNamesFile = arguments.file("wikidata_cache", "wikidata cache file", defaultWikidataCache);
|
||||
wikidataMaxAge =
|
||||
arguments.getDuration("wikidata_max_age",
|
||||
"Maximum age of Wikidata translations (in ISO-8601 duration format PnDTnHnMn.nS; 0S = disabled)", "0s");
|
||||
wikidataUpdateLimit = arguments.getInteger("wikidata_update_limit",
|
||||
"Limit on how many old translations to update during one download (0 = disabled)", 0);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -793,7 +801,8 @@ public class Planetiler {
|
|||
ensureInputFilesExist();
|
||||
|
||||
if (fetchWikidata) {
|
||||
Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats());
|
||||
Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats(), wikidataMaxAge,
|
||||
wikidataUpdateLimit);
|
||||
}
|
||||
if (useWikidata) {
|
||||
translations().addFallbackTranslationProvider(Wikidata.load(wikidataNamesFile));
|
||||
|
|
|
@ -473,7 +473,10 @@ public class Arguments {
|
|||
*/
|
||||
public Duration getDuration(String key, String description, String defaultValue) {
|
||||
String value = getArg(key, defaultValue);
|
||||
Duration parsed = Duration.parse("PT" + value);
|
||||
if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
|
||||
value = "PT" + value;
|
||||
}
|
||||
Duration parsed = Duration.parse(value);
|
||||
logArgValue(key, description, parsed.get(ChronoUnit.SECONDS) + " seconds");
|
||||
return parsed;
|
||||
}
|
||||
|
|
|
@ -37,6 +37,9 @@ import java.net.http.HttpResponse.BodyHandlers;
|
|||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Clock;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -111,11 +114,23 @@ public class Wikidata {
|
|||
* @throws UncheckedIOException if an error occurs
|
||||
*/
|
||||
public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats) {
|
||||
fetch(infile, outfile, config, profile, stats, Duration.ofSeconds(0), 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads any existing translations from {@code outfile}, then downloads translations for any wikidata element in
|
||||
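* {@code infile} that have not already been downloaded and writes the results to {@code outfile}. Existing
* translations last updated more than {@code maxAge} ago are discarded on load so they get downloaded again
* (a zero {@code maxAge} disables this), and at most {@code updateLimit} of them are discarded per call
* ({@code updateLimit <= 0} means no limit).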
|
||||
*
|
||||
* @throws UncheckedIOException if an error occurs
|
||||
*/
|
||||
public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats,
|
||||
Duration maxAge, int updateLimit) {
|
||||
|
||||
var timer = stats.startStage("wikidata");
|
||||
int processThreads = Math.max(1, config.threads() - 1);
|
||||
LOGGER.info("Starting with " + processThreads + " process threads");
|
||||
|
||||
WikidataTranslations oldMappings = load(outfile);
|
||||
WikidataTranslations oldMappings = load(outfile, maxAge, updateLimit);
|
||||
try (
|
||||
Writer writer = Files.newBufferedWriter(outfile);
|
||||
OsmBlockSource osmSource = infile.get()
|
||||
|
@ -163,13 +178,19 @@ public class Wikidata {
|
|||
* Returns translations parsed from {@code path} that was written by a previous run of the downloader.
|
||||
*/
|
||||
public static WikidataTranslations load(Path path) {
|
||||
var translationsProvider = Wikidata.load(path, Duration.ZERO, 0);
|
||||
translationsProvider.clearUpdateTimes();
|
||||
return translationsProvider;
|
||||
}
|
||||
|
||||
private static WikidataTranslations load(Path path, Duration maxAge, int updateLimit) {
|
||||
Timer timer = Timer.start();
|
||||
if (!Files.exists(path)) {
|
||||
LOGGER.info("no wikidata translations found, run with --fetch-wikidata to download");
|
||||
return new WikidataTranslations();
|
||||
} else {
|
||||
try (BufferedReader fis = Files.newBufferedReader(path)) {
|
||||
WikidataTranslations result = load(fis);
|
||||
WikidataTranslations result = load(fis, maxAge, updateLimit, Clock.systemUTC());
|
||||
LOGGER.info(
|
||||
"loaded from " + result.getAll().size() + " mappings from " + path.toAbsolutePath() + " in " + timer.stop());
|
||||
return result;
|
||||
|
@ -185,14 +206,37 @@ public class Wikidata {
|
|||
* second element is a map from language to translation.
|
||||
*/
|
||||
static WikidataTranslations load(BufferedReader reader) throws IOException {
|
||||
return load(reader, Duration.ZERO, 0, Clock.systemUTC());
|
||||
}
|
||||
|
||||
protected static WikidataTranslations load(BufferedReader reader, Duration maxAge, int updateLimit, Clock clock)
|
||||
throws IOException {
|
||||
WikidataTranslations mappings = new WikidataTranslations();
|
||||
String line;
|
||||
Instant updateTimeLimit = maxAge.isZero() ? null : Instant.now(clock).minus(maxAge);
|
||||
int updateCounter = 0;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
JsonNode node = objectMapper.readTree(line);
|
||||
long id = Long.parseLong(node.get(0).asText());
|
||||
|
||||
Instant updateTime = Instant.EPOCH;
|
||||
if (node.has(2)) {
|
||||
updateTime = Instant.ofEpochMilli(node.get(2).asLong());
|
||||
}
|
||||
if (updateTimeLimit != null && updateTime.isBefore(updateTimeLimit) &&
|
||||
(updateLimit <= 0 || updateCounter < updateLimit)) {
|
||||
// do not load old entries => new translations will be fetched later
|
||||
updateCounter++;
|
||||
continue;
|
||||
}
|
||||
mappings.putUpdateTime(id, updateTime);
|
||||
|
||||
ObjectNode theseMappings = (ObjectNode) node.get(1);
|
||||
theseMappings.fields().forEachRemaining(entry -> mappings.put(id, entry.getKey(), entry.getValue().asText()));
|
||||
}
|
||||
if (updateCounter > 0) {
|
||||
LOGGER.info("{} translations dropped as too old, will be re-fetched", updateCounter);
|
||||
}
|
||||
return mappings;
|
||||
}
|
||||
|
||||
|
@ -255,7 +299,7 @@ public class Wikidata {
|
|||
LongObjectMap<Map<String, String>> results = queryWikidata(qidsToFetch);
|
||||
batches.inc();
|
||||
LOGGER.info("Fetched batch {} ({} qids) {}", batches.get(), qidsToFetch.size(), timer.stop());
|
||||
writeTranslations(results);
|
||||
writeTranslations(results, Hppc.newLongObjectHashMap());
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throwFatalException(e);
|
||||
|
@ -330,9 +374,10 @@ public class Wikidata {
|
|||
|
||||
void loadExisting(WikidataTranslations oldMappings) throws IOException {
|
||||
LongObjectMap<Map<String, String>> alreadyHave = oldMappings.getAll();
|
||||
LongObjectMap<Instant> alreadyHaveUpdateTimes = oldMappings.getUpdateTimes();
|
||||
if (!alreadyHave.isEmpty()) {
|
||||
LOGGER.info("skipping " + alreadyHave.size() + " mappings we already have");
|
||||
writeTranslations(alreadyHave);
|
||||
writeTranslations(alreadyHave, alreadyHaveUpdateTimes);
|
||||
for (LongObjectCursor<Map<String, String>> cursor : alreadyHave) {
|
||||
visited.add(cursor.key);
|
||||
}
|
||||
|
@ -340,11 +385,16 @@ public class Wikidata {
|
|||
}
|
||||
|
||||
/** Flushes a batch of translations to disk. */
|
||||
private void writeTranslations(LongObjectMap<Map<String, String>> results) throws IOException {
|
||||
private void writeTranslations(LongObjectMap<Map<String, String>> results, LongObjectMap<Instant> updateTimes)
|
||||
throws IOException {
|
||||
final long updateTimeDefault = Instant.now().toEpochMilli();
|
||||
for (LongObjectCursor<Map<String, String>> cursor : results) {
|
||||
long updateTime =
|
||||
updateTimes.containsKey(cursor.key) ? updateTimes.get(cursor.key).toEpochMilli() : updateTimeDefault;
|
||||
writer.write(objectMapper.writeValueAsString(List.of(
|
||||
Long.toString(cursor.key),
|
||||
cursor.value
|
||||
cursor.value,
|
||||
updateTime
|
||||
)));
|
||||
writer.write(System.lineSeparator());
|
||||
}
|
||||
|
@ -383,6 +433,7 @@ public class Wikidata {
|
|||
public static class WikidataTranslations implements Translations.TranslationProvider {
|
||||
|
||||
private final LongObjectMap<Map<String, String>> data = Hppc.newLongObjectHashMap();
|
||||
private final LongObjectMap<Instant> updateTimes = Hppc.newLongObjectHashMap();
|
||||
|
||||
public WikidataTranslations() {}
|
||||
|
||||
|
@ -391,11 +442,25 @@ public class Wikidata {
|
|||
return data.get(qid);
|
||||
}
|
||||
|
||||
/** Removes every stored update time; the stored translations are left untouched. */
public void clearUpdateTimes() {
|
||||
updateTimes.clear();
|
||||
}
|
||||
|
||||
/** Returns all maps from language code to translated name for {@code qid}. */
|
||||
public LongObjectMap<Map<String, String>> getAll() {
|
||||
return data;
|
||||
}
|
||||
|
||||
/** Returns the update time stored for each qid, keyed by qid. */
|
||||
public LongObjectMap<Instant> getUpdateTimes() {
|
||||
return updateTimes;
|
||||
}
|
||||
|
||||
/** Stores an update date+time for {@code qid}. */
|
||||
public void putUpdateTime(long qid, Instant updateTime) {
|
||||
updateTimes.put(qid, updateTime);
|
||||
}
|
||||
|
||||
/** Stores a name translation for {@code qid} in {@code lang}. */
|
||||
public void put(long qid, String lang, String value) {
|
||||
Map<String, String> map = data.get(qid);
|
||||
|
|
|
@ -19,6 +19,10 @@ import java.net.http.HttpResponse.BodySubscriber;
|
|||
import java.net.http.HttpResponse.BodySubscribers;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Clock;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.time.ZoneId;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -75,6 +79,17 @@ class WikidataTest {
|
|||
}
|
||||
}
|
||||
""";
|
||||
final String wikidataNamesLegacyJson = """
|
||||
["1",{"en":"English 1","de":"Deutch 1"}]
|
||||
["2",{"en":"English 2","de":"Deutch 2"}]
|
||||
["3",{"en":"English 3","de":"Deutch 3"}]
|
||||
""";
|
||||
final Clock clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneId.of("UTC"));
|
||||
final String wikidataNamesJson = """
|
||||
["1",{"en":"English 1","de":"Deutch 1"},55000]
|
||||
["2",{"en":"English 2","de":"Deutch 2"},30000]
|
||||
["3",{"en":"English 3","de":"Deutch 3"},30000]
|
||||
""";
|
||||
|
||||
@Test
|
||||
void testWikidataTranslations() {
|
||||
|
@ -161,6 +176,33 @@ class WikidataTest {
|
|||
assertInstanceOf(IOException.class, innerException);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testLegacyWikidataNamesJson() throws IOException {
|
||||
var reader = new BufferedReader(new StringReader(wikidataNamesLegacyJson));
|
||||
// no timestamp + age limit set => all old => all should be dropped
|
||||
var translationsProvider = Wikidata.load(reader, Duration.ofSeconds(1), 0, clock);
|
||||
assertEquals(0, translationsProvider.getAll().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWikidataNamesJsonMaxAge() throws IOException {
|
||||
// 10s => item 1 is 5s old hence fresh, the rest is 30s old hence outdated
|
||||
Duration maxAge = Duration.ofSeconds(10);
|
||||
|
||||
var reader = new BufferedReader(new StringReader(wikidataNamesJson));
|
||||
var translationsProvider = Wikidata.load(reader, maxAge, 0, clock);
|
||||
assertEquals(1, translationsProvider.getAll().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWikidataNamesJsonUpdateLimit() throws IOException {
|
||||
Duration maxAge = Duration.ofSeconds(1);
|
||||
|
||||
var reader = new BufferedReader(new StringReader(wikidataNamesJson));
|
||||
var translationsProvider = Wikidata.load(reader, maxAge, 1, clock);
|
||||
assertEquals(2, translationsProvider.getAll().size());
|
||||
}
|
||||
|
||||
private static void assertEqualsIgnoringWhitespace(String expected, String actual) {
|
||||
assertEquals(ignoreWhitespace(expected), ignoreWhitespace(actual));
|
||||
}
|
||||
|
|
Ładowanie…
Reference in New Issue