// planetiler/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/LongLongMap.java
// 351 lines, 11 KiB, Java

package com.onthegomap.planetiler.collection;
import static com.onthegomap.planetiler.util.MemoryEstimator.estimateSize;
import com.carrotsearch.hppc.ByteArrayList;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.FileUtils;
import com.onthegomap.planetiler.util.MemoryEstimator;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
/**
* A map that stores a single {@code long} value for each OSM node. A single thread writes the values for each node ID
* sequentially then multiple threads can read values concurrently.
* <p>
* Three implementations are provided: {@link #noop()} which ignores writes and throws on reads, {@link SortedTable}
* which stores node IDs and values sorted by node ID and does binary search on lookup, and {@link SparseArray} which
* only stores values and uses the node ID as the index into the array (with some compression to avoid storing many
* sequential 0's).
* <p>
* Use {@link SortedTable} for small OSM extracts and {@link SparseArray} when processing the entire planet.
* <p>
* Each implementation can be backed by either {@link AppendStoreRam} to store data in RAM or {@link AppendStoreMmap} to
* store data in a memory-mapped file.
*/
public interface LongLongMap extends Closeable, MemoryEstimator.HasEstimate, DiskBacked {
/*
* Idea graveyard (all too slow):
* - rocksdb
* - mapdb sorted table
* - sqlite table with key and value columns
*/
/**
 * Sentinel that {@link SortedTable} and {@link SparseArray} return from {@code get} when no value is stored for the
 * requested key. They also use it internally as padding, so a stored value equal to {@code Long.MIN_VALUE} cannot be
 * told apart from a missing entry.
 */
long MISSING_VALUE = Long.MIN_VALUE;
/**
 * Builds a longlong map from the string settings used in configuration.
 * <p>
 * {@code storage} is validated first, so an invalid storage value is rejected even when {@code name} is
 * {@code "noop"}.
 *
 * @param name    which implementation to use: {@code "noop"}, {@code "sortedtable"} or {@code "sparsearray"}
 * @param storage how to store data: {@code "ram"} or {@code "mmap"}
 * @param path    where to store data (only used for mmap storage)
 * @return A longlong map instance
 * @throws IllegalArgumentException if {@code name} or {@code storage} is not valid
 */
static LongLongMap from(String name, String storage, Path path) {
  final boolean inMemory = isRam(storage);
  switch (name) {
    case "noop" -> {
      return noop();
    }
    case "sortedtable" -> {
      if (inMemory) {
        return newInMemorySortedTable();
      }
      return newDiskBackedSortedTable(path);
    }
    case "sparsearray" -> {
      if (inMemory) {
        return newInMemorySparseArray();
      }
      return newDiskBackedSparseArray(path);
    }
    default -> throw new IllegalArgumentException("Unexpected value: " + name);
  }
}
/**
 * Estimates the number of bytes of RAM this nodemap will use for a given OSM input file.
 * <p>
 * {@code "noop"} is estimated at zero. The other implementations are estimated at a fixed 300MB, plus a per-node cost
 * (12 bytes for {@code "sortedtable"}, 9 bytes for {@code "sparsearray"}) that is only counted when the data is held
 * in RAM.
 *
 * @param name        implementation name: {@code "noop"}, {@code "sortedtable"} or {@code "sparsearray"}
 * @param storage     {@code "ram"} or {@code "mmap"}
 * @param osmFileSize size of the OSM input file in bytes
 * @return estimated RAM usage in bytes
 * @throws IllegalArgumentException if {@code name} or {@code storage} is not valid
 */
static long estimateMemoryUsage(String name, String storage, long osmFileSize) {
  final boolean inMemory = isRam(storage);
  final long nodeCount = estimateNumNodes(osmFileSize);
  final long perNodeBytes;
  switch (name) {
    case "noop" -> {
      return 0;
    }
    case "sortedtable" -> perNodeBytes = 12;
    case "sparsearray" -> perNodeBytes = 9;
    default -> throw new IllegalArgumentException("Unexpected value: " + name);
  }
  final long heapForNodes = inMemory ? perNodeBytes * nodeCount : 0L;
  return 300_000_000L + heapForNodes;
}
/**
 * Estimates the number of bytes of disk this nodemap will use for a given OSM input file.
 * <p>
 * RAM storage never uses disk, so it returns zero without looking at {@code name}. For mmap storage the estimate is a
 * per-node cost: 12 bytes for {@code "sortedtable"}, 9 bytes for {@code "sparsearray"} and nothing for
 * {@code "noop"}.
 *
 * @param name        implementation name: {@code "noop"}, {@code "sortedtable"} or {@code "sparsearray"}
 * @param storage     {@code "ram"} or {@code "mmap"}
 * @param osmFileSize size of the OSM input file in bytes
 * @return estimated disk usage in bytes
 * @throws IllegalArgumentException if {@code storage} is not valid, or if storage is mmap and {@code name} is not
 *                                  valid
 */
static long estimateDiskUsage(String name, String storage, long osmFileSize) {
  if (isRam(storage)) {
    return 0;
  }
  final long nodeCount = estimateNumNodes(osmFileSize);
  final long perNodeBytes = switch (name) {
    case "noop" -> 0L;
    case "sortedtable" -> 12L;
    case "sparsearray" -> 9L;
    default -> throw new IllegalArgumentException("Unexpected value: " + name);
  };
  return perNodeBytes * nodeCount;
}
/**
 * Parses the storage setting.
 *
 * @param storage {@code "ram"} or {@code "mmap"}
 * @return {@code true} for {@code "ram"}, {@code false} for {@code "mmap"}
 * @throws IllegalArgumentException for any other value
 */
private static boolean isRam(String storage) {
  switch (storage) {
    case "ram":
      return true;
    case "mmap":
      return false;
    default:
      throw new IllegalArgumentException("Unexpected storage value: " + storage);
  }
}
/**
 * Estimates how many nodes an OSM input file contains by scaling linearly from a reference planet file.
 *
 * @param osmFileSize size of the OSM input file in bytes
 * @return estimated number of nodes in that file
 */
private static long estimateNumNodes(long osmFileSize) {
  // In February 2022, planet.pbf was 62GB with ~7.5 billion nodes, so scale from there.
  // (The previous constant of 750 million was off by a factor of 10, which under-estimated every
  // memory and disk requirement derived from it.)
  final double referencePlanetNodes = 7_500_000_000d;
  final double referencePlanetBytes = 62_000_000_000d;
  return Math.round(referencePlanetNodes * (osmFileSize / referencePlanetBytes));
}
/**
 * Returns a longlong map that stores no data: writes are silently dropped and any read throws
 * {@link UnsupportedOperationException}.
 */
static LongLongMap noop() {
  return new LongLongMap() {
    @Override
    public long get(long key) {
      throw new UnsupportedOperationException("get");
    }

    @Override
    public void put(long key, long value) {
      // writes are intentionally discarded
    }

    @Override
    public void close() {
      // nothing to release
    }

    @Override
    public long diskUsageBytes() {
      return 0L;
    }
  };
}
/**
 * Returns an in-memory longlong map that uses 12-bytes per node and binary search to find values.
 * <p>
 * Keys are held in an {@link AppendStore.SmallLongs} built from RAM-backed int stores and values in a RAM-backed long
 * store.
 */
static LongLongMap newInMemorySortedTable() {
  AppendStore.Longs keyStore = new AppendStore.SmallLongs(segment -> new AppendStoreRam.Ints());
  AppendStore.Longs valueStore = new AppendStoreRam.Longs();
  return new SortedTable(keyStore, valueStore);
}
/**
 * Returns a memory-mapped longlong map that uses 12-bytes per node and binary search to find values.
 * <p>
 * The directory is created first; keys are then stored in files named {@code keys-<i>} and values in a file named
 * {@code values} inside it.
 *
 * @param dir directory that will hold the memory-mapped files
 */
static LongLongMap newDiskBackedSortedTable(Path dir) {
  FileUtils.createDirectory(dir);
  AppendStore.Longs keyStore =
    new AppendStore.SmallLongs(segment -> new AppendStoreMmap.Ints(dir.resolve("keys-" + segment)));
  AppendStore.Longs valueStore = new AppendStoreMmap.Longs(dir.resolve("values"));
  return new SortedTable(keyStore, valueStore);
}
/**
 * Returns an in-memory longlong map that uses 8-bytes per node and O(1) lookup, at the cost of wasting space on
 * padding entries when the key space is fragmented.
 */
static LongLongMap newInMemorySparseArray() {
  AppendStore.Longs valueStore = new AppendStoreRam.Longs();
  return new SparseArray(valueStore);
}
/**
 * Returns a memory-mapped longlong map that uses 8-bytes per node and O(1) lookup, at the cost of wasting space on
 * padding entries when the key space is fragmented.
 *
 * @param path file that will back the memory-mapped value store
 */
static LongLongMap newDiskBackedSparseArray(Path path) {
  AppendStore.Longs valueStore = new AppendStoreMmap.Longs(path);
  return new SparseArray(valueStore);
}
/**
 * Writes the value for a key. Not thread safe! All writes must come from a single thread, in order by key. No writes
 * can be performed after the first read.
 * <p>
 * {@link SortedTable} and {@link SparseArray} throw {@link IllegalArgumentException} when {@code key} is not strictly
 * greater than the previously written key.
 *
 * @param key   the key (OSM node ID) to store a value for
 * @param value the value to associate with {@code key}
 */
void put(long key, long value);
/**
 * Returns the value for a key. Safe to be called by multiple threads after all values have been written. After the
 * first read, all writes will fail.
 *
 * @param key the key (OSM node ID) to look up
 * @return the stored value; {@link SortedTable} and {@link SparseArray} return {@link #MISSING_VALUE} when nothing is
 *         stored for {@code key}, while the {@link #noop()} map throws {@link UnsupportedOperationException} instead
 */
long get(long key);
/** Reports zero bytes of disk by default; implementations that write to disk override this. */
@Override
default long diskUsageBytes() {
  return 0L;
}
/** Reports zero bytes of memory by default; implementations that hold data override this. */
@Override
default long estimateMemoryUsageBytes() {
  return 0L;
}
/**
 * Looks up several keys by calling {@link #get(long)} once per key, in order.
 *
 * @param key the keys to look up
 * @return a new array of the same length where element {@code i} is the result of {@code get(key[i])}
 */
default long[] multiGet(long[] key) {
  final long[] found = new long[key.length];
  int position = 0;
  for (long k : key) {
    found[position++] = get(k);
  }
  return found;
}
/**
 * A longlong map that stores keys and values sorted by key and does a binary search to lookup values.
 * <p>
 * Keys must be written in strictly ascending order. Lookups return {@link #MISSING_VALUE} for keys that were never
 * written.
 */
class SortedTable implements LongLongMap {
  /*
   * The binary search never covers the whole key array: "offsets" keeps the index of the first key of each block
   * of 256 consecutive key values (key >>> 8), so a lookup does an O(1) jump to the block and then a binary search
   * over at most 256 entries.
   */
  private final AppendStore.Longs offsets = new AppendStoreRam.Longs();
  private final AppendStore.Longs keys;
  private final AppendStore.Longs values;
  private long lastChunk = -1;
  private long lastKey = -1;

  /**
   * @param keys   store that receives each key in insertion order
   * @param values store that receives each value at the same index as its key
   */
  public SortedTable(AppendStore.Longs keys, AppendStore.Longs values) {
    this.keys = keys;
    this.values = values;
  }

  /**
   * Appends a key/value pair.
   *
   * @throws IllegalArgumentException if {@code key} is not strictly greater than the previously written key
   */
  @Override
  public void put(long key, long value) {
    if (key <= lastKey) {
      throw new IllegalArgumentException("Nodes must be sorted ascending by ID, " + key + " came after " + lastKey);
    }
    lastKey = key;
    long idx = keys.size();
    long chunk = key >>> 8;
    if (chunk != lastChunk) {
      // first key of a new chunk: point this chunk, and any empty chunks skipped on the way, at the current index
      while (offsets.size() <= chunk) {
        offsets.appendLong(idx);
      }
      lastChunk = chunk;
    }
    keys.appendLong(key);
    values.appendLong(value);
  }

  /** Returns the value stored for {@code key}, or {@link #MISSING_VALUE} if there is none. */
  @Override
  public long get(long key) {
    long chunk = key >>> 8;
    if (chunk >= offsets.size()) {
      return MISSING_VALUE;
    }

    // use the "offsets" index to narrow search space to <256 values
    long lo = offsets.getLong(chunk);
    // the last chunk has no successor entry, so its range runs to the end of the key array
    long hi = Math.min(keys.size(), chunk >= offsets.size() - 1 ? keys.size() : offsets.getLong(chunk + 1)) - 1;

    while (lo <= hi) {
      long idx = (lo + hi) >>> 1;
      long candidate = keys.getLong(idx);
      if (candidate < key) {
        lo = idx + 1;
      } else if (candidate > key) {
        hi = idx - 1;
      } else {
        // found
        return values.getLong(idx);
      }
    }
    return MISSING_VALUE;
  }

  @Override
  public long diskUsageBytes() {
    return keys.diskUsageBytes() + values.diskUsageBytes();
  }

  @Override
  public long estimateMemoryUsageBytes() {
    return keys.estimateMemoryUsageBytes() + values.estimateMemoryUsageBytes() + offsets.estimateMemoryUsageBytes();
  }

  /**
   * Closes the key, value and offset stores.
   *
   * @throws IOException if closing any of the stores fails
   */
  @Override
  public void close() throws IOException {
    // Attempt to close every store even when an earlier close() throws, so one failure does not leak the others.
    try {
      keys.close();
    } finally {
      try {
        values.close();
      } finally {
        offsets.close();
      }
    }
  }
}
/**
 * A longlong map that only stores values and uses the key as an index into the array, with some tweaks to avoid
 * storing many sequential 0's.
 * <p>
 * Keys must be written in strictly ascending order and must not exceed {@code MAX_KEY}, because the per-chunk
 * bookkeeping is indexed by {@code int}. Lookups return {@link #MISSING_VALUE} for keys that were never written,
 * including negative keys and keys beyond the written range.
 */
class SparseArray implements LongLongMap {
  /** Largest key whose chunk number ({@code key >>> 8}) still fits in an {@code int}. */
  private static final long MAX_KEY = (((long) Integer.MAX_VALUE) << 8) | 255L;

  // The key space is broken into chunks of 256 and for each chunk, store:
  // 1) the index in the outputs array for the first key in the block
  private final AppendStore.Longs offsets = new AppendStoreRam.Longs();
  // 2) the number of leading 0's at the start of each block
  private final ByteArrayList offsetStartPad = new ByteArrayList();
  private final AppendStore.Longs values;
  private long lastChunk = -1;
  private int lastOffset = 0;
  private long lastKey = -1;

  /**
   * @param values store that receives the values (and padding entries for gaps inside a chunk)
   */
  public SparseArray(AppendStore.Longs values) {
    this.values = values;
  }

  /**
   * Appends the value for a key.
   *
   * @throws IllegalArgumentException if {@code key} is not strictly greater than the previously written key, or if
   *                                  it is larger than this implementation can index
   */
  @Override
  public void put(long key, long value) {
    if (key <= lastKey) {
      throw new IllegalArgumentException("Nodes must be sorted ascending by ID, " + key + " came after " + lastKey);
    }
    // Reject keys whose chunk number would not fit in an int instead of letting a narrowing cast wrap around and
    // silently corrupt the layout.
    if (key > MAX_KEY) {
      throw new IllegalArgumentException("Node ID " + key + " is too large for sparse array, max is " + MAX_KEY);
    }
    lastKey = key;
    long idx = values.size();
    long chunk = key >>> 8;
    int offset = (int) (key & 255);

    if (chunk != lastChunk) {
      // new chunk, store offset and leading zeros
      lastOffset = offset;
      while (offsets.size() <= chunk) {
        offsets.appendLong(idx);
        offsetStartPad.add((byte) offset);
      }
      lastChunk = chunk;
    } else {
      // same chunk, write not_founds until we get to right idx
      while (++lastOffset < offset) {
        values.appendLong(MISSING_VALUE);
      }
    }
    values.appendLong(value);
  }

  /** Returns the value stored for {@code key}, or {@link #MISSING_VALUE} if there is none. */
  @Override
  public long get(long key) {
    // Keep the chunk number as a long: narrowing it to int before the bounds check would make very large or
    // negative keys alias onto unrelated chunks (or produce a negative index).
    long chunk = key >>> 8;
    if (chunk >= offsets.size()) {
      return MISSING_VALUE;
    }
    int offset = (int) (key & 255);
    long lo = offsets.getLong(chunk);
    // the last chunk has no successor entry, so its range runs to the end of the value array
    long hi = Math.min(values.size(), chunk >= offsets.size() - 1 ? values.size() : offsets.getLong(chunk + 1)) - 1;
    // safe narrowing: chunk < offsets.size(), and put() only ever creates int-indexable chunks
    int startPad = offsetStartPad.get((int) chunk) & 255;
    long index = lo + offset - startPad;

    if (index > hi || index < lo) {
      return MISSING_VALUE;
    }
    return values.getLong(index);
  }

  @Override
  public long diskUsageBytes() {
    return values.diskUsageBytes();
  }

  @Override
  public long estimateMemoryUsageBytes() {
    return values.estimateMemoryUsageBytes() + estimateSize(offsets) + estimateSize(offsetStartPad);
  }

  /**
   * Releases the padding list and closes the value and offset stores.
   *
   * @throws IOException if closing either store fails
   */
  @Override
  public void close() throws IOException {
    offsetStartPad.release();
    // Attempt to close both stores even when the first close() throws, so one failure does not leak the other.
    try {
      values.close();
    } finally {
      offsets.close();
    }
  }
}
}