diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java index df7b0254..e0fbae09 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java @@ -523,6 +523,24 @@ public class VectorTile { return !empty; } + /** + * Determine whether a tile is likely to be a duplicate of some other tile hence it makes sense to calculate a hash + * for it. + *
+ * Deduplication code is aiming for a balance between filtering-out all duplicates and not spending too much CPU on + * hash calculations: calculating hashes for all tiles costs too much CPU, not calculating hashes at all means + * generating mbtiles which are too big. This method is responsible for achieving that balance. + *
+ * Current understanding is that, for the whole planet, there are 267m total tiles and 38m unique tiles. The
+ * {@link #containsOnlyFillsOrEdges()} heuristic catches >99.9% of repeated tiles and cuts down the number of tile
+ * hashes we need to track by 98% (38m to 735k). So it is considered a good tradeoff.
+ *
+ * @return {@code true} if the tile might have duplicates, hence we want to calculate a hash for it
+ */
+ public boolean likelyToBeDuplicated() {
+ return layers.values().stream().allMatch(layer -> layer.encodedFeatures.isEmpty()) || containsOnlyFillsOrEdges();
+ }
+
private enum Command {
MOVE_TO(1),
LINE_TO(2),
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
index 46892bf1..a2b51075 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
@@ -12,7 +12,6 @@ import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.util.CloseableConusmer;
import com.onthegomap.planetiler.util.CommonStringEncoder;
import com.onthegomap.planetiler.util.DiskBacked;
-import com.onthegomap.planetiler.util.Hashing;
import com.onthegomap.planetiler.util.LayerStats;
import com.onthegomap.planetiler.worker.Worker;
import java.io.Closeable;
@@ -368,22 +367,6 @@ public final class FeatureGroup implements Iterable
- * Used as an optimization to avoid writing the same (ocean) tiles over and over again.
- */
- public long generateContentHash() {
- long hash = Hashing.FNV1_64_INIT;
- for (var feature : entries) {
- byte layerId = extractLayerIdFromKey(feature.key());
- hash = Hashing.fnv1a64(hash, layerId);
- hash = Hashing.fnv1a64(hash, feature.value());
- }
- return hash;
- }
-
/**
* Returns true if {@code other} contains features with identical layer, geometry, and attributes, as this tile -
* even if the tiles have separate coordinates.
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
index 61dbca8c..0f523937 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
@@ -16,6 +16,7 @@ import com.onthegomap.planetiler.stats.Timer;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.FileUtils;
import com.onthegomap.planetiler.util.Format;
+import com.onthegomap.planetiler.util.Hashing;
import com.onthegomap.planetiler.util.LayerStats;
import com.onthegomap.planetiler.worker.WorkQueue;
import com.onthegomap.planetiler.worker.Worker;
@@ -289,8 +290,8 @@ public class MbtilesWriter {
lastEncoded = encoded;
lastBytes = bytes;
last = tileFeatures;
- if (compactDb && en.containsOnlyFillsOrEdges()) {
- tileDataHash = tileFeatures.generateContentHash();
+ if (compactDb && en.likelyToBeDuplicated() && bytes != null) {
+ tileDataHash = generateContentHash(bytes);
} else {
tileDataHash = null;
}
@@ -412,6 +413,15 @@ public class MbtilesWriter {
return Stream.of(tilesByZoom).mapToLong(c -> c.get()).sum();
}
+ /**
+ * Generates a hash over the encoded and compressed tile bytes using {@code Hashing.fnv1a64}.
+ * <p>
+ * Used as an optimization to avoid writing the same (mostly ocean) tiles over and over again.
+ */
+ public static long generateContentHash(byte[] bytes) {
+ return Hashing.fnv1a64(bytes);
+ }
+
/**
* Container for a batch of tiles to be processed together in the encoder and writer threads.
*
diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
index cf347194..13defbe5 100644
--- a/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
+++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
@@ -27,10 +27,12 @@ import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
@@ -42,6 +44,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.reflect.FieldUtils;
import org.locationtech.jts.algorithm.Orientation;
import org.locationtech.jts.geom.Coordinate;
import org.locationtech.jts.geom.CoordinateSequence;
@@ -685,4 +688,33 @@ public class TestUtils {
fail(e);
}
}
+
+ public static void assertTileDuplicates(Mbtiles db, int expected) {
+ try {
+ Connection connection = (Connection) FieldUtils.readField(db, "connection", true);
+ Statement statement = connection.createStatement();
+ ResultSet rs = statement.executeQuery("SELECT tile_data FROM tiles_data");
+ ArrayList