Update old wikidata items (#971)

pull/978/head
Peter Hanecak 2024-08-10 11:17:13 +02:00 committed by GitHub
parent 240edee34e
commit 8512b7729c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 127 additions and 8 deletions

View file: Planetiler.java

@@ -40,6 +40,7 @@ import java.io.IOException;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -111,6 +112,8 @@ public class Planetiler {
private boolean useWikidata = false;
private boolean onlyFetchWikidata = false;
private boolean fetchWikidata = false;
private Duration wikidataMaxAge = Duration.ZERO;
private int wikidataUpdateLimit = 0;
private final boolean fetchOsmTileStats;
private TileArchiveMetadata tileArchiveMetadata;
@@ -573,6 +576,11 @@ public class Planetiler {
fetchWikidata);
useWikidata = fetchWikidata || arguments.getBoolean("use_wikidata", "use wikidata translations", true);
wikidataNamesFile = arguments.file("wikidata_cache", "wikidata cache file", defaultWikidataCache);
wikidataMaxAge =
arguments.getDuration("wikidata_max_age",
"Maximum age of Wikidata translations (in ISO-8601 duration format PnDTnHnMn.nS; 0S = disabled)", "0s");
wikidataUpdateLimit = arguments.getInteger("wikidata_update_limit",
"Limit on how many old translations to update during one download (0 = disabled)", 0);
return this;
}
@@ -793,7 +801,8 @@
ensureInputFilesExist();
if (fetchWikidata) {
- Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats());
+ Wikidata.fetch(osmInputFile(), wikidataNamesFile, config(), profile(), stats(), wikidataMaxAge,
+   wikidataUpdateLimit);
}
if (useWikidata) {
translations().addFallbackTranslationProvider(Wikidata.load(wikidataNamesFile));
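Putting the new knobs together, a minimal sketch of a build that opts in to refreshing stale translations (the profile and paths are hypothetical; flag names follow the arguments registered above):

public static void main(String[] args) throws Exception {
  Planetiler.create(Arguments.fromArgs(
      "--fetch-wikidata",              // download missing/stale translations first
      "--wikidata_max_age=P30D",       // treat cached entries older than 30 days as stale
      "--wikidata_update_limit=5000")) // refresh at most 5000 stale entries per run
    .setProfile(new CustomProfile())   // hypothetical Profile implementation
    .addOsmSource("osm", Path.of("data", "input.osm.pbf"))
    .overwriteOutput(Path.of("data", "output.mbtiles"))
    .run();
}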

View file: Arguments.java

@@ -473,7 +473,10 @@ public class Arguments {
*/
public Duration getDuration(String key, String description, String defaultValue) {
String value = getArg(key, defaultValue);
Duration parsed = Duration.parse("PT" + value);
if (!value.startsWith("P") && !value.startsWith("T") && !value.startsWith("-")) {
value = "PT" + value;
}
Duration parsed = Duration.parse(value);
logArgValue(key, description, parsed.get(ChronoUnit.SECONDS) + " seconds");
return parsed;
}
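With the new guard, shorthand values keep the implicit PT prefix while full ISO-8601 strings are parsed as-is; a few illustrative inputs (plain java.time.Duration semantics):

Duration.parse("PT" + "90s"); // shorthand "90s" still gets the prefix -> PT1M30S
Duration.parse("P30D");       // date-based ISO-8601 form now passes through unchanged
Duration.parse("-PT15M");     // negative durations are left untouched as well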

View file: Wikidata.java

@@ -37,6 +37,9 @@ import java.net.http.HttpResponse.BodyHandlers;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -111,11 +114,23 @@ public class Wikidata {
* @throws UncheckedIOException if an error occurs
*/
public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats) {
fetch(infile, outfile, config, profile, stats, Duration.ofSeconds(0), 0);
}
/**
* Loads any existing translations from {@code outfile}, then downloads translations for any wikidata elements in
* {@code infile} that have not already been downloaded, or whose cached entries are older than {@code maxAge}
* (refreshing at most {@code updateLimit} stale entries per run), and writes the results to {@code outfile}.
*
* @throws UncheckedIOException if an error occurs
*/
public static void fetch(OsmInputFile infile, Path outfile, PlanetilerConfig config, Profile profile, Stats stats,
Duration maxAge, int updateLimit) {
var timer = stats.startStage("wikidata");
int processThreads = Math.max(1, config.threads() - 1);
LOGGER.info("Starting with " + processThreads + " process threads");
- WikidataTranslations oldMappings = load(outfile);
+ WikidataTranslations oldMappings = load(outfile, maxAge, updateLimit);
try (
Writer writer = Files.newBufferedWriter(outfile);
OsmBlockSource osmSource = infile.get()
@@ -163,13 +178,19 @@
* Returns translations parsed from {@code path} that was written by a previous run of the downloader.
*/
public static WikidataTranslations load(Path path) {
var translationsProvider = Wikidata.load(path, Duration.ZERO, 0);
translationsProvider.clearUpdateTimes();
return translationsProvider;
}
private static WikidataTranslations load(Path path, Duration maxAge, int updateLimit) {
Timer timer = Timer.start();
if (!Files.exists(path)) {
LOGGER.info("no wikidata translations found, run with --fetch-wikidata to download");
return new WikidataTranslations();
} else {
try (BufferedReader fis = Files.newBufferedReader(path)) {
- WikidataTranslations result = load(fis);
+ WikidataTranslations result = load(fis, maxAge, updateLimit, Clock.systemUTC());
LOGGER.info(
"loaded " + result.getAll().size() + " mappings from " + path.toAbsolutePath() + " in " + timer.stop());
return result;
@@ -185,14 +206,37 @@
* second element is a map from language to translation, and the optional third element is the update time in epoch
* milliseconds.
*/
static WikidataTranslations load(BufferedReader reader) throws IOException {
return load(reader, Duration.ZERO, 0, Clock.systemUTC());
}
protected static WikidataTranslations load(BufferedReader reader, Duration maxAge, int updateLimit, Clock clock)
throws IOException {
WikidataTranslations mappings = new WikidataTranslations();
String line;
Instant updateTimeLimit = maxAge.isZero() ? null : Instant.now(clock).minus(maxAge);
int updateCounter = 0;
while ((line = reader.readLine()) != null) {
JsonNode node = objectMapper.readTree(line);
long id = Long.parseLong(node.get(0).asText());
Instant updateTime = Instant.EPOCH;
if (node.has(2)) {
updateTime = Instant.ofEpochMilli(node.get(2).asLong());
}
if (updateTimeLimit != null && updateTime.isBefore(updateTimeLimit) &&
(updateLimit <= 0 || updateCounter < updateLimit)) {
// do not load old entries => new translations will be fetched later
updateCounter++;
continue;
}
mappings.putUpdateTime(id, updateTime);
ObjectNode theseMappings = (ObjectNode) node.get(1);
theseMappings.fields().forEachRemaining(entry -> mappings.put(id, entry.getKey(), entry.getValue().asText()));
}
if (updateCounter > 0) {
LOGGER.info("{} translations dropped as too old, will be re-fetched", updateCounter);
}
return mappings;
}
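To make the interplay of maxAge and updateLimit concrete, a small sketch against the same cache-line format the tests below use (values are illustrative; the four-argument load is package-visible):

var clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneId.of("UTC"));
var reader = new BufferedReader(new StringReader("""
  ["1",{"en":"One"},55000]
  ["2",{"en":"Two"},30000]
  ["3",{"en":"Three"},30000]
  """));
// cutoff = 60s - 10s = 50s: entry 1 (55s) is fresh, entries 2 and 3 (30s) are stale,
// but updateLimit = 1 drops only the first stale entry, so entries 1 and 3 survive
// and only entry 2 will be re-fetched on the next download.
var kept = Wikidata.load(reader, Duration.ofSeconds(10), 1, clock);
// kept.getAll().size() == 2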
@@ -255,7 +299,7 @@ public class Wikidata {
LongObjectMap<Map<String, String>> results = queryWikidata(qidsToFetch);
batches.inc();
LOGGER.info("Fetched batch {} ({} qids) {}", batches.get(), qidsToFetch.size(), timer.stop());
- writeTranslations(results);
+ writeTranslations(results, Hppc.newLongObjectHashMap());
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throwFatalException(e);
@@ -330,9 +374,10 @@
void loadExisting(WikidataTranslations oldMappings) throws IOException {
LongObjectMap<Map<String, String>> alreadyHave = oldMappings.getAll();
LongObjectMap<Instant> alreadyHaveUpdateTimes = oldMappings.getUpdateTimes();
if (!alreadyHave.isEmpty()) {
LOGGER.info("skipping " + alreadyHave.size() + " mappings we already have");
- writeTranslations(alreadyHave);
+ writeTranslations(alreadyHave, alreadyHaveUpdateTimes);
for (LongObjectCursor<Map<String, String>> cursor : alreadyHave) {
visited.add(cursor.key);
}
@@ -340,11 +385,16 @@
}
/** Flushes a batch of translations to disk. */
- private void writeTranslations(LongObjectMap<Map<String, String>> results) throws IOException {
+ private void writeTranslations(LongObjectMap<Map<String, String>> results, LongObjectMap<Instant> updateTimes)
+   throws IOException {
final long updateTimeDefault = Instant.now().toEpochMilli();
for (LongObjectCursor<Map<String, String>> cursor : results) {
long updateTime =
updateTimes.containsKey(cursor.key) ? updateTimes.get(cursor.key).toEpochMilli() : updateTimeDefault;
writer.write(objectMapper.writeValueAsString(List.of(
Long.toString(cursor.key),
- cursor.value
+ cursor.value,
+ updateTime
)));
writer.write(System.lineSeparator());
}
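Each flushed line is therefore a three-element JSON array of qid, language-to-translation map, and update time in epoch milliseconds, for example (illustrative values):

["123",{"en":"Example","de":"Beispiel"},1723280233000]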
@@ -383,6 +433,7 @@
public static class WikidataTranslations implements Translations.TranslationProvider {
private final LongObjectMap<Map<String, String>> data = Hppc.newLongObjectHashMap();
private final LongObjectMap<Instant> updateTimes = Hppc.newLongObjectHashMap();
public WikidataTranslations() {}
@@ -391,11 +442,25 @@
return data.get(qid);
}
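/** Discards all stored update times; the public loader calls this once they are no longer needed. */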
public void clearUpdateTimes() {
updateTimes.clear();
}
/** Returns all language-to-name translations, keyed by {@code qid}. */
public LongObjectMap<Map<String, String>> getAll() {
return data;
}
/** Returns the update time for each {@code qid}. */
public LongObjectMap<Instant> getUpdateTimes() {
return updateTimes;
}
/** Stores an update date+time for {@code qid}. */
public void putUpdateTime(long qid, Instant updateTime) {
updateTimes.put(qid, updateTime);
}
/** Stores a name translation for {@code qid} in {@code lang}. */
public void put(long qid, String lang, String value) {
Map<String, String> map = data.get(qid);

View file: WikidataTest.java

@@ -19,6 +19,10 @@ import java.net.http.HttpResponse.BodySubscriber;
import java.net.http.HttpResponse.BodySubscribers;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneId;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -75,6 +79,17 @@ class WikidataTest {
}
}
""";
final String wikidataNamesLegacyJson = """
["1",{"en":"English 1","de":"Deutch 1"}]
["2",{"en":"English 2","de":"Deutch 2"}]
["3",{"en":"English 3","de":"Deutch 3"}]
""";
final Clock clock = Clock.fixed(Instant.ofEpochMilli(60_000), ZoneId.of("UTC"));
final String wikidataNamesJson = """
["1",{"en":"English 1","de":"Deutch 1"},55000]
["2",{"en":"English 2","de":"Deutch 2"},30000]
["3",{"en":"English 3","de":"Deutch 3"},30000]
""";
@Test
void testWikidataTranslations() {
@@ -161,6 +176,33 @@
assertInstanceOf(IOException.class, innerException);
}
@Test
void testLegacyWikidataNamesJson() throws IOException {
var reader = new BufferedReader(new StringReader(wikidataNamesLegacyJson));
// no timestamp + age limit set => all old => all should be dropped
var translationsProvider = Wikidata.load(reader, Duration.ofSeconds(1), 0, clock);
assertEquals(0, translationsProvider.getAll().size());
}
@Test
void testWikidataNamesJsonMaxAge() throws IOException {
// 10s => item 1 is 5s old hence fresh, the rest is 30s old hence outdated
Duration maxAge = Duration.ofSeconds(10);
var reader = new BufferedReader(new StringReader(wikidataNamesJson));
var translationsProvider = Wikidata.load(reader, maxAge, 0, clock);
assertEquals(1, translationsProvider.getAll().size());
}
@Test
void testWikidataNamesJsonUpdateLimit() throws IOException {
Duration maxAge = Duration.ofSeconds(1);
var reader = new BufferedReader(new StringReader(wikidataNamesJson));
var translationsProvider = Wikidata.load(reader, maxAge, 1, clock);
assertEquals(2, translationsProvider.getAll().size());
}
private static void assertEqualsIgnoringWhitespace(String expected, String actual) {
assertEquals(ignoreWhitespace(expected), ignoreWhitespace(actual));
}