Tile hashing fix (#436)

pull/447/head
Peter Hanecak 2023-01-14 22:03:50 +01:00 zatwierdzone przez GitHub
rodzic 9fbf952239
commit 4a622a8ef0
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
6 zmienionych plików z 77 dodań i 24 usunięć

Wyświetl plik

@ -523,6 +523,24 @@ public class VectorTile {
return !empty;
}
/**
 * Reports whether this tile is a plausible candidate for deduplication, i.e. whether it is worth spending CPU time
 * on computing a hash for it.
 * <p>
 * Hashing every tile costs too much CPU, while hashing none of them produces mbtiles which are too big, so only
 * tiles that are likely to repeat get hashed. A tile qualifies when none of its layers has any encoded feature, or
 * when {@link #containsOnlyFillsOrEdges()} holds.
 * <p>
 * Current understanding is that the whole planet has 267m total tiles, 38m of them unique. The
 * {@link #containsOnlyFillsOrEdges()} heuristic catches more than 99.9% of repeated tiles and cuts down the number
 * of tile hashes that need to be tracked by 98% (38m to 735k), which is considered a good tradeoff.
 *
 * @return {@code true} if the tile might have duplicates and a hash should therefore be calculated for it
 */
public boolean likelyToBeDuplicated() {
  for (var layer : layers.values()) {
    if (!layer.encodedFeatures.isEmpty()) {
      // At least one layer carries features, so the decision is left to the fill/edge heuristic.
      return containsOnlyFillsOrEdges();
    }
  }
  // No layer holds any encoded feature.
  return true;
}
private enum Command {
MOVE_TO(1),
LINE_TO(2),

Wyświetl plik

@ -12,7 +12,6 @@ import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.util.CloseableConusmer;
import com.onthegomap.planetiler.util.CommonStringEncoder;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.Hashing;
import com.onthegomap.planetiler.util.LayerStats;
import com.onthegomap.planetiler.worker.Worker;
import java.io.Closeable;
@ -368,22 +367,6 @@ public final class FeatureGroup implements Iterable<FeatureGroup.TileFeatures>,
return tileCoord;
}
/**
 * Computes a hash of this tile's contents by folding, for every entry in order, the layer ID extracted from the
 * entry's key followed by the entry's value into a running {@code Hashing.fnv1a64} hash.
 * <p>
 * The hash covers the features' relevant data (layer, geometry, and attributes); the coordinates are <b>not</b>
 * part of it.
 * <p>
 * Used as an optimization to avoid writing the same (ocean) tiles over and over again.
 *
 * @return the content hash, starting from {@code Hashing.FNV1_64_INIT}
 */
public long generateContentHash() {
  long contentHash = Hashing.FNV1_64_INIT;
  for (var entry : entries) {
    // Kept as a byte-typed local so the same fnv1a64 overload is selected as before.
    byte layer = extractLayerIdFromKey(entry.key());
    contentHash = Hashing.fnv1a64(Hashing.fnv1a64(contentHash, layer), entry.value());
  }
  return contentHash;
}
/**
* Returns true if {@code other} contains features with identical layer, geometry, and attributes, as this tile -
* even if the tiles have separate coordinates.

Wyświetl plik

@ -16,6 +16,7 @@ import com.onthegomap.planetiler.stats.Timer;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.FileUtils;
import com.onthegomap.planetiler.util.Format;
import com.onthegomap.planetiler.util.Hashing;
import com.onthegomap.planetiler.util.LayerStats;
import com.onthegomap.planetiler.worker.WorkQueue;
import com.onthegomap.planetiler.worker.Worker;
@ -289,8 +290,8 @@ public class MbtilesWriter {
lastEncoded = encoded;
lastBytes = bytes;
last = tileFeatures;
if (compactDb && en.containsOnlyFillsOrEdges()) {
tileDataHash = tileFeatures.generateContentHash();
if (compactDb && en.likelyToBeDuplicated() && bytes != null) {
tileDataHash = generateContentHash(bytes);
} else {
tileDataHash = null;
}
@ -412,6 +413,15 @@ public class MbtilesWriter {
return Stream.of(tilesByZoom).mapToLong(c -> c.get()).sum();
}
/**
 * Computes the content hash of a tile from its encoded and compressed bytes.
 * <p>
 * Used as an optimization to avoid writing the same (mostly ocean) tiles over and over again.
 *
 * @param bytes the encoded and compressed tile
 * @return the value of {@code Hashing.fnv1a64} applied to {@code bytes}
 */
public static long generateContentHash(byte[] bytes) {
  final long contentHash = Hashing.fnv1a64(bytes);
  return contentHash;
}
/**
* Container for a batch of tiles to be processed together in the encoder and writer threads.
* <p>

Wyświetl plik

@ -27,10 +27,12 @@ import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
@ -42,6 +44,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.locationtech.jts.algorithm.Orientation;
import org.locationtech.jts.geom.Coordinate;
import org.locationtech.jts.geom.CoordinateSequence;
@ -685,4 +688,33 @@ public class TestUtils {
fail(e);
}
}
/**
 * Asserts that the {@code tiles_data} table of {@code db} contains exactly {@code expected} duplicated tiles.
 * <p>
 * A row counts as a duplicate when its {@code tile_data} is byte-for-byte equal to the {@code tile_data} of some
 * earlier row, so {@code n} identical rows contribute {@code n - 1} duplicates.
 * <p>
 * The JDBC connection is read reflectively from the {@code connection} field of {@code db}; it is owned by
 * {@code db} and is therefore left open. The statement and result set created here are closed before returning.
 * Reflection or SQL failures fail the test.
 *
 * @param db       the mbtiles database to inspect
 * @param expected the expected number of duplicated tiles
 */
public static void assertTileDuplicates(Mbtiles db, int expected) {
  try {
    Connection connection = (Connection) FieldUtils.readField(db, "connection", true);
    var tiles = new ArrayList<byte[]>();
    // try-with-resources so the statement and result set are released even when reading fails
    try (
      Statement statement = connection.createStatement();
      ResultSet rs = statement.executeQuery("SELECT tile_data FROM tiles_data")
    ) {
      while (rs.next()) {
        tiles.add(rs.getBytes("tile_data"));
      }
    }
    // indexes of rows whose content equals that of some earlier row
    Set<Integer> dups = new HashSet<>();
    for (int i = 0; i < tiles.size(); i++) {
      for (int j = i + 1; j < tiles.size(); j++) {
        if (Arrays.equals(tiles.get(i), tiles.get(j))) {
          // Set.add is already a no-op for an index that was recorded before
          dups.add(j);
        }
      }
    }
    int dupCount = dups.size();
    assertEquals(expected, dupCount, "%d duplicates expected, %d found".formatted(expected, dupCount));
  } catch (IllegalAccessException | SQLException e) {
    fail(e);
  }
}
}

Wyświetl plik

@ -12,9 +12,12 @@ import com.onthegomap.planetiler.Profile;
import com.onthegomap.planetiler.VectorTile;
import com.onthegomap.planetiler.geo.GeometryType;
import com.onthegomap.planetiler.geo.TileCoord;
import com.onthegomap.planetiler.mbtiles.MbtilesWriter;
import com.onthegomap.planetiler.render.RenderedFeature;
import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.util.CloseableConusmer;
import com.onthegomap.planetiler.util.Gzip;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -356,17 +359,22 @@ class FeatureGroupTest {
@ParameterizedTest(name = "{0}")
@ArgumentsSource(SameFeatureGroupTestArgs.class)
void testGenerateContentHash(String testName, boolean expectSame, PuTileArgs args0, PuTileArgs args1) {
void testGenerateContentHash(String testName, boolean expectSame, PuTileArgs args0, PuTileArgs args1)
throws IOException {
put(args0);
put(args1);
sorter.sort();
var iter = features.iterator();
var tile0 = iter.next();
var tile1 = iter.next();
var tileHash0 = MbtilesWriter.generateContentHash(
Gzip.gzip(iter.next().getVectorTileEncoder().encode())
);
var tileHash1 = MbtilesWriter.generateContentHash(
Gzip.gzip(iter.next().getVectorTileEncoder().encode())
);
if (expectSame) {
assertEquals(tile0.generateContentHash(), tile1.generateContentHash());
assertEquals(tileHash0, tileHash1);
} else {
assertNotEquals(tile0.generateContentHash(), tile1.generateContentHash());
assertNotEquals(tileHash0, tileHash1);
}
}

Wyświetl plik

@ -110,6 +110,8 @@ class BikeRouteOverlayTest {
"name", "EuroVelo 8 - Mediterranean Route - part Monaco",
"ref", "EV8"
), GeoUtils.WORLD_LAT_LON_BOUNDS, 25, LineString.class);
TestUtils.assertTileDuplicates(mbtiles, 0);
}
}
}