diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java
index df7b0254..e0fbae09 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java
@@ -523,6 +523,24 @@ public class VectorTile {
     return !empty;
   }
 
+  /**
+   * Determines whether this tile is likely to be a duplicate of some other tile, in which case it makes sense to
+   * calculate a hash for it.
+   * <p>
+   * Deduplication code aims for a balance between filtering out all duplicates and not spending too much CPU on
+   * hash calculations: calculating hashes for all tiles costs too much CPU, not calculating hashes at all means
+   * generating mbtiles which are too big. This method is responsible for achieving that balance.
+   * <p>
+   * Current understanding is that, for the whole planet, there are 267m total tiles and 38m unique tiles. The
+   * {@link #containsOnlyFillsOrEdges()} heuristic catches over 99.9% of repeated tiles and cuts down the number of
+   * tile hashes we need to track by 98% (38m to 735k), so it is considered a good tradeoff.
+   *
+   * @return {@code true} if the tile might have duplicates, hence we want to calculate a hash for it
+   */
+  public boolean likelyToBeDuplicated() {
+    return layers.values().stream().allMatch(v -> v.encodedFeatures.isEmpty()) || containsOnlyFillsOrEdges();
+  }
+
   private enum Command {
     MOVE_TO(1),
     LINE_TO(2),
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
index 46892bf1..a2b51075 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java
@@ -12,7 +12,6 @@ import com.onthegomap.planetiler.stats.Stats;
 import com.onthegomap.planetiler.util.CloseableConusmer;
 import com.onthegomap.planetiler.util.CommonStringEncoder;
 import com.onthegomap.planetiler.util.DiskBacked;
-import com.onthegomap.planetiler.util.Hashing;
 import com.onthegomap.planetiler.util.LayerStats;
 import com.onthegomap.planetiler.worker.Worker;
 import java.io.Closeable;
@@ -368,22 +367,6 @@ public final class FeatureGroup implements Iterable<FeatureGroup.TileFeatures>,
       return tileCoord;
     }
 
-    /**
-     * Generates a hash over the feature's relevant data: layer, geometry, and attributes. The coordinates are
-     * not part of the hash.
-     * <p>
-     * Used as an optimization to avoid writing the same (ocean) tiles over and over again.
-     */
-    public long generateContentHash() {
-      long hash = Hashing.FNV1_64_INIT;
-      for (var feature : entries) {
-        byte layerId = extractLayerIdFromKey(feature.key());
-        hash = Hashing.fnv1a64(hash, layerId);
-        hash = Hashing.fnv1a64(hash, feature.value());
-      }
-      return hash;
-    }
-
     /**
      * Returns true if {@code other} contains features with identical layer, geometry, and attributes, as this tile -
      * even if the tiles have separate coordinates.
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
index 61dbca8c..0f523937 100644
--- a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
+++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
@@ -16,6 +16,7 @@ import com.onthegomap.planetiler.stats.Timer;
 import com.onthegomap.planetiler.util.DiskBacked;
 import com.onthegomap.planetiler.util.FileUtils;
 import com.onthegomap.planetiler.util.Format;
+import com.onthegomap.planetiler.util.Hashing;
 import com.onthegomap.planetiler.util.LayerStats;
 import com.onthegomap.planetiler.worker.WorkQueue;
 import com.onthegomap.planetiler.worker.Worker;
@@ -289,8 +290,8 @@ public class MbtilesWriter {
           lastEncoded = encoded;
           lastBytes = bytes;
           last = tileFeatures;
-          if (compactDb && en.containsOnlyFillsOrEdges()) {
-            tileDataHash = tileFeatures.generateContentHash();
+          if (compactDb && bytes != null && en.likelyToBeDuplicated()) {
+            tileDataHash = generateContentHash(bytes);
           } else {
             tileDataHash = null;
           }
@@ -412,6 +413,18 @@ public class MbtilesWriter {
     return Stream.of(tilesByZoom).mapToLong(c -> c.get()).sum();
   }
 
+  /**
+   * Generates a hash over the encoded and compressed tile.
+   * <p>
+   * Used as an optimization to avoid writing the same (mostly ocean) tiles over and over again.
+   *
+   * @param bytes the encoded and compressed tile contents; callers are expected to pass a non-null array
+   * @return the hash computed by {@code Hashing.fnv1a64} over {@code bytes}
+   */
+  public static long generateContentHash(byte[] bytes) {
+    return Hashing.fnv1a64(bytes);
+  }
+
   /**
    * Container for a batch of tiles to be processed together in the encoder and writer threads.
    * <p>
diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
index cf347194..13defbe5 100644
--- a/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
+++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/TestUtils.java
@@ -27,10 +27,12 @@ import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
@@ -42,6 +44,7 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.function.Function;
 import java.util.stream.Collectors;
+import org.apache.commons.lang3.reflect.FieldUtils;
 import org.locationtech.jts.algorithm.Orientation;
 import org.locationtech.jts.geom.Coordinate;
 import org.locationtech.jts.geom.CoordinateSequence;
@@ -685,4 +688,40 @@ public class TestUtils {
       fail(e);
     }
   }
+
+  /**
+   * Asserts that exactly {@code expected} rows of the {@code tiles_data} table in {@code db} carry {@code tile_data}
+   * bytes identical to those of some earlier row.
+   */
+  public static void assertTileDuplicates(Mbtiles db, int expected) {
+    try {
+      // read the "connection" field reflectively; forceAccess=true makes this work even if the field is not public
+      Connection connection = (Connection) FieldUtils.readField(db, "connection", true);
+      var tiles = new ArrayList<byte[]>();
+      // try-with-resources closes the statement and result set even when the query or a read fails
+      try (
+        Statement statement = connection.createStatement();
+        ResultSet rs = statement.executeQuery("SELECT tile_data FROM tiles_data")
+      ) {
+        while (rs.next()) {
+          tiles.add(rs.getBytes("tile_data"));
+        }
+      }
+
+      // indexes of tiles whose contents repeat those of a tile with a lower index
+      Set<Integer> dups = new HashSet<>();
+      for (int i = 0; i < tiles.size(); i++) {
+        for (int j = i + 1; j < tiles.size(); j++) {
+          if (Arrays.equals(tiles.get(i), tiles.get(j))) {
+            dups.add(j);
+          }
+        }
+      }
+
+      int dupCount = dups.size();
+      assertEquals(expected, dupCount, "%d duplicates expected, %d found".formatted(expected, dupCount));
+    } catch (IllegalAccessException | SQLException e) {
+      fail(e);
+    }
+  }
 }
diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java
index a3f49da4..a5f89f1f 100644
--- a/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java
+++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java
@@ -12,9 +12,12 @@ import com.onthegomap.planetiler.Profile;
 import com.onthegomap.planetiler.VectorTile;
 import com.onthegomap.planetiler.geo.GeometryType;
 import com.onthegomap.planetiler.geo.TileCoord;
+import com.onthegomap.planetiler.mbtiles.MbtilesWriter;
 import com.onthegomap.planetiler.render.RenderedFeature;
 import com.onthegomap.planetiler.stats.Stats;
 import com.onthegomap.planetiler.util.CloseableConusmer;
+import com.onthegomap.planetiler.util.Gzip;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -356,17 +359,22 @@ class FeatureGroupTest {
 
   @ParameterizedTest(name = "{0}")
   @ArgumentsSource(SameFeatureGroupTestArgs.class)
-  void testGenerateContentHash(String testName, boolean expectSame, PuTileArgs args0, PuTileArgs args1) {
+  void testGenerateContentHash(String testName, boolean expectSame, PuTileArgs args0, PuTileArgs args1)
+    throws IOException {
     put(args0);
     put(args1);
     sorter.sort();
     var iter = features.iterator();
-    var tile0 = iter.next();
-    var tile1 = iter.next();
+    var tileHash0 = MbtilesWriter.generateContentHash(
+      Gzip.gzip(iter.next().getVectorTileEncoder().encode())
+    );
+    var tileHash1 = MbtilesWriter.generateContentHash(
+      Gzip.gzip(iter.next().getVectorTileEncoder().encode())
+    );
     if (expectSame) {
-      assertEquals(tile0.generateContentHash(), tile1.generateContentHash());
+      assertEquals(tileHash0, tileHash1);
     } else {
-      assertNotEquals(tile0.generateContentHash(), tile1.generateContentHash());
+      assertNotEquals(tileHash0, tileHash1);
     }
   }
 
diff --git a/planetiler-examples/src/test/java/com/onthegomap/planetiler/examples/BikeRouteOverlayTest.java b/planetiler-examples/src/test/java/com/onthegomap/planetiler/examples/BikeRouteOverlayTest.java
index 7691e2ab..525c1278 100644
--- a/planetiler-examples/src/test/java/com/onthegomap/planetiler/examples/BikeRouteOverlayTest.java
+++ b/planetiler-examples/src/test/java/com/onthegomap/planetiler/examples/BikeRouteOverlayTest.java
@@ -110,6 +110,8 @@ class BikeRouteOverlayTest {
         "name", "EuroVelo 8 - Mediterranean Route - part Monaco",
         "ref", "EV8"
       ), GeoUtils.WORLD_LAT_LON_BOUNDS, 25, LineString.class);
+
+      TestUtils.assertTileDuplicates(mbtiles, 0);
     }
   }
 }