kopia lustrzana https://github.com/onthegomap/planetiler
Extract ZIP file walker out of ShapefileReader. (#423)
rodzic
7adf46819a
commit
184f950051
|
@ -29,7 +29,6 @@ import java.io.IOException;
|
|||
import java.nio.file.FileSystem;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.PathMatcher;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -253,48 +252,53 @@ public class Planetiler {
|
|||
}
|
||||
|
||||
/**
|
||||
* Adds a new ESRI shapefile directory source that will process all files under {@param basePath} matching
|
||||
* {@param globPattern} using an explicit projection.
|
||||
* Adds a new ESRI shapefile glob source that will process all files under {@param basePath} matching
|
||||
* {@param globPattern}. {@param basePath} may be a directory or ZIP archive.
|
||||
*
|
||||
* @param sourceName string to use in stats and logs to identify this stage
|
||||
* @param basePath path to the directory containing shapefiles to process
|
||||
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
|
||||
* @return this runner instance for chaining
|
||||
* @see ShapefileReader
|
||||
*/
|
||||
public Planetiler addShapefileGlobSource(String sourceName, Path basePath, String globPattern) {
|
||||
return addShapefileGlobSource(null, sourceName, basePath, globPattern, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new ESRI shapefile glob source that will process all files under {@param basePath} matching
|
||||
* {@param globPattern} using an explicit projection. {@param basePath} may be a directory or ZIP archive.
|
||||
* <p>
|
||||
* If {@param globPattern} matches a ZIP archive, all files ending in {@code .shp} within the archive will be used for
|
||||
* this source.
|
||||
* <p>
|
||||
* If the file does not exist and {@code download=true} argument is set, then the file will first be downloaded from
|
||||
* {@code defaultUrl}.
|
||||
* <p>
|
||||
*
|
||||
* @param projection the Coordinate Reference System authority code to use, parsed with
|
||||
* {@link org.geotools.referencing.CRS#decode(String)}
|
||||
* @param sourceName string to use in stats and logs to identify this stage
|
||||
* @param basePath path to the directory containing shapefiles to process
|
||||
* @param basePath path to the directory or zip file containing shapefiles to process
|
||||
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
|
||||
* @param defaultUrl remote URL that the file to download if {@code download=true} argument is set and
|
||||
* {@code name_url} argument is not set
|
||||
* @return this runner instance for chaining
|
||||
* @see ShapefileReader
|
||||
*/
|
||||
public Planetiler addShapefileDirectorySource(String projection, String sourceName, Path basePath,
|
||||
String globPattern) {
|
||||
Path dirPath = getPath(sourceName, "shapefile directory", basePath, null);
|
||||
PathMatcher matcher = dirPath.getFileSystem().getPathMatcher("glob:" + globPattern);
|
||||
public Planetiler addShapefileGlobSource(String projection, String sourceName, Path basePath,
|
||||
String globPattern, String defaultUrl) {
|
||||
Path dirPath = getPath(sourceName, "shapefile glob", basePath, defaultUrl);
|
||||
|
||||
return addStage(sourceName, "Process all files matching " + dirPath + "/" + globPattern,
|
||||
ifSourceUsed(sourceName, () -> {
|
||||
try (
|
||||
var walk = Files.walk(dirPath);
|
||||
var sourcePaths = walk.filter(path -> matcher.matches(path.getFileName()))
|
||||
) {
|
||||
ShapefileReader.processWithProjection(projection, sourceName, sourcePaths.toList(), featureGroup, config,
|
||||
profile, stats);
|
||||
}
|
||||
var sourcePaths = FileUtils.walkPathWithPattern(basePath, globPattern,
|
||||
zipPath -> FileUtils.walkPathWithPattern(zipPath, "*.shp"));
|
||||
ShapefileReader.processWithProjection(projection, sourceName, sourcePaths, featureGroup, config,
|
||||
profile, stats);
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new ESRI shapefile directory source that will process all files under {@param basePath} matching
|
||||
* {@param globPattern}.
|
||||
*
|
||||
* @param sourceName string to use in stats and logs to identify this stage
|
||||
* @param basePath path to the directory containing shapefiles to process
|
||||
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
|
||||
* @return this runner instance for chaining
|
||||
* @see ShapefileReader
|
||||
*/
|
||||
public Planetiler addShapefileDirectorySource(String sourceName, Path basePath, String globPattern) {
|
||||
return addShapefileDirectorySource(null, sourceName, basePath, globPattern);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds a new ESRI shapefile source that will be processed with an explicit projection when {@link #run()} is called.
|
||||
|
@ -320,9 +324,14 @@ public class Planetiler {
|
|||
public Planetiler addShapefileSource(String projection, String name, Path defaultPath, String defaultUrl) {
|
||||
Path path = getPath(name, "shapefile", defaultPath, defaultUrl);
|
||||
return addStage(name, "Process features in " + path,
|
||||
ifSourceUsed(name,
|
||||
() -> ShapefileReader.processWithProjection(projection, name, List.of(path), featureGroup, config, profile,
|
||||
stats)));
|
||||
ifSourceUsed(name, () -> {
|
||||
List<Path> sourcePaths = List.of(path);
|
||||
if (FileUtils.hasExtension(path, "zip") || Files.isDirectory(path)) {
|
||||
sourcePaths = FileUtils.walkPathWithPattern(path, "*.shp");
|
||||
}
|
||||
|
||||
ShapefileReader.processWithProjection(projection, name, sourcePaths, featureGroup, config, profile, stats);
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -4,17 +4,12 @@ import com.onthegomap.planetiler.Profile;
|
|||
import com.onthegomap.planetiler.collection.FeatureGroup;
|
||||
import com.onthegomap.planetiler.config.PlanetilerConfig;
|
||||
import com.onthegomap.planetiler.stats.Stats;
|
||||
import com.onthegomap.planetiler.util.FileUtils;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.net.URI;
|
||||
import java.nio.file.FileSystems;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Stream;
|
||||
import org.geotools.data.FeatureSource;
|
||||
import org.geotools.data.shapefile.ShapefileDataStore;
|
||||
import org.geotools.feature.FeatureCollection;
|
||||
|
@ -96,34 +91,9 @@ public class ShapefileReader extends SimpleReader<SimpleFeature> {
|
|||
);
|
||||
}
|
||||
|
||||
private static URI findShpFile(Path path, Stream<Path> walkStream) {
|
||||
return walkStream
|
||||
.filter(z -> FileUtils.hasExtension(z, "shp"))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new IllegalArgumentException("No .shp file found inside " + path))
|
||||
.toUri();
|
||||
}
|
||||
|
||||
private ShapefileDataStore open(Path path) {
|
||||
try {
|
||||
URI uri;
|
||||
if (Files.isDirectory(path)) {
|
||||
try (var walkStream = Files.walk(path)) {
|
||||
uri = findShpFile(path, walkStream);
|
||||
}
|
||||
} else if (FileUtils.hasExtension(path, "zip")) {
|
||||
try (
|
||||
var zipFs = FileSystems.newFileSystem(path);
|
||||
var walkStream = FileUtils.walkFileSystem(zipFs)
|
||||
) {
|
||||
uri = findShpFile(path, walkStream);
|
||||
}
|
||||
} else if (FileUtils.hasExtension(path, "shp")) {
|
||||
uri = path.toUri();
|
||||
} else {
|
||||
throw new IllegalArgumentException("Invalid shapefile input: " + path + " must be zip or shp");
|
||||
}
|
||||
var store = new ShapefileDataStore(uri.toURL());
|
||||
var store = new ShapefileDataStore(path.toUri().toURL());
|
||||
store.setTryCPGFile(true);
|
||||
return store;
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -5,12 +5,16 @@ import java.io.InputStream;
|
|||
import java.io.UncheckedIOException;
|
||||
import java.nio.file.FileStore;
|
||||
import java.nio.file.FileSystem;
|
||||
import java.nio.file.FileSystems;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.NoSuchFileException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.PathMatcher;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
import java.util.zip.ZipEntry;
|
||||
|
@ -22,6 +26,7 @@ import org.slf4j.LoggerFactory;
|
|||
* Convenience methods for working with files on disk.
|
||||
*/
|
||||
public class FileUtils {
|
||||
|
||||
private static final Format FORMAT = Format.defaultInstance();
|
||||
// Prevent zip-bomb attack, see https://rules.sonarsource.com/java/RSPEC-5042
|
||||
private static final int ZIP_THRESHOLD_ENTRIES = 10_000;
|
||||
|
@ -45,6 +50,61 @@ public class FileUtils {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns list of paths matching {@param pattern} within {@param basePath}.
|
||||
* <p>
|
||||
* If {@param basePath} is a directory, then {@param walkZipFile} will be invoked for each matching {@code .zip} file
|
||||
* found. This function should return paths of interest within the zip file.
|
||||
*
|
||||
* @param basePath file path to recursively walk, either a directory or ZIP archive.
|
||||
* @param pattern pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
|
||||
* @param walkZipFile callback function to recurse into matching {@code .zip} files.
|
||||
*/
|
||||
public static List<Path> walkPathWithPattern(Path basePath, String pattern,
|
||||
Function<Path, List<Path>> walkZipFile) {
|
||||
PathMatcher matcher = basePath.getFileSystem().getPathMatcher("glob:" + pattern);
|
||||
|
||||
try {
|
||||
if (FileUtils.hasExtension(basePath, "zip")) {
|
||||
try (
|
||||
var zipFs = FileSystems.newFileSystem(basePath);
|
||||
var walkStream = FileUtils.walkFileSystem(zipFs)
|
||||
) {
|
||||
return walkStream
|
||||
.filter(p -> p.getFileName() != null && matcher.matches(p.getFileName()))
|
||||
.toList();
|
||||
}
|
||||
} else if (Files.isDirectory(basePath)) {
|
||||
try (var walk = Files.walk(basePath)) {
|
||||
return walk
|
||||
.filter(path -> matcher.matches(path.getFileName()))
|
||||
.flatMap(path -> {
|
||||
if (FileUtils.hasExtension(path, "zip")) {
|
||||
return walkZipFile.apply(path).stream();
|
||||
} else {
|
||||
return Stream.of(path);
|
||||
}
|
||||
})
|
||||
.toList();
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("No files matching " + basePath + "/" + pattern);
|
||||
}
|
||||
} catch (IOException exc) {
|
||||
throw new UncheckedIOException(exc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns list of paths matching {@param pattern} within {@param basePath}.
|
||||
*
|
||||
* @param basePath file path to recursively walk, either a directory or ZIP archive.
|
||||
* @param pattern pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
|
||||
*/
|
||||
public static List<Path> walkPathWithPattern(Path basePath, String pattern) {
|
||||
return walkPathWithPattern(basePath, pattern, zipPath -> List.of(zipPath));
|
||||
}
|
||||
|
||||
/** Returns true if {@code path} ends with ".extension" (case-insensitive). */
|
||||
public static boolean hasExtension(Path path, String extension) {
|
||||
return path.toString().toLowerCase().endsWith("." + extension.toLowerCase());
|
||||
|
|
|
@ -1700,14 +1700,17 @@ class PlanetilerTests {
|
|||
.setAttr("source", source.getSource());
|
||||
}
|
||||
})
|
||||
.addShapefileDirectorySource("shapefile-dir", resourceDir, "shape*.zip")
|
||||
// Match *.shp within [shapefile.zip, shapefile-copy.zip]
|
||||
.addShapefileGlobSource("shapefile-glob", resourceDir, "shape*.zip")
|
||||
// Match *.shp within shapefile.zip
|
||||
.addShapefileGlobSource("shapefile-glob-zip", resourceDir.resolve("shapefile.zip"), "*.shp")
|
||||
// Match *.shp within shapefile.zip
|
||||
.addShapefileSource("shapefile", resourceDir.resolve("shapefile.zip"))
|
||||
.setOutput("mbtiles", mbtiles)
|
||||
.run();
|
||||
|
||||
try (Mbtiles db = Mbtiles.newReadOnlyDatabase(mbtiles)) {
|
||||
long fileCount = 0;
|
||||
long dirCount = 0;
|
||||
long fileCount = 0, globCount = 0, globZipCount = 0;
|
||||
var tileMap = TestUtils.getTileMap(db);
|
||||
for (var tile : tileMap.values()) {
|
||||
for (var feature : tile) {
|
||||
|
@ -1715,15 +1718,17 @@ class PlanetilerTests {
|
|||
|
||||
switch ((String) feature.attrs().get("source")) {
|
||||
case "shapefile" -> fileCount++;
|
||||
case "shapefile-dir" -> dirCount++;
|
||||
case "shapefile-glob" -> globCount++;
|
||||
case "shapefile-glob-zip" -> globZipCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Input file was copied twice into test directory, directory source should have
|
||||
// 2x the number of features.
|
||||
assertTrue(fileCount > 0);
|
||||
assertEquals(2 * fileCount, dirCount);
|
||||
// `shapefile` and `shapefile-glob-zip` both match only one file.
|
||||
assertEquals(fileCount, globZipCount);
|
||||
// `shapefile-glob` matches two input files, should have 2x number of features of `shapefile`.
|
||||
assertEquals(2 * fileCount, globCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -27,18 +27,11 @@ class ShapefileReaderTest {
|
|||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
@Test
|
||||
@Timeout(30)
|
||||
void testReadShapefile() {
|
||||
testReadShapefile(TestUtils.pathToResource("shapefile.zip"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Timeout(30)
|
||||
@DisabledOnOs(OS.WINDOWS) // the zip file doesn't fully close, which causes trouble running test on windows
|
||||
void testReadShapefileExtracted() throws IOException {
|
||||
var extracted = TestUtils.extractPathToResource(tempDir, "shapefile.zip");
|
||||
testReadShapefile(extracted);
|
||||
try (var fs = FileSystems.newFileSystem(extracted)) {
|
||||
var path = fs.getPath("shapefile", "stations.shp");
|
||||
testReadShapefile(path);
|
||||
|
@ -50,7 +43,6 @@ class ShapefileReaderTest {
|
|||
void testReadShapefileUnzipped() throws IOException {
|
||||
var dest = tempDir.resolve("shapefile.zip");
|
||||
FileUtils.unzipResource("/shapefile.zip", dest);
|
||||
testReadShapefile(dest);
|
||||
testReadShapefile(dest.resolve("shapefile").resolve("stations.shp"));
|
||||
}
|
||||
|
||||
|
|
|
@ -4,11 +4,15 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import com.onthegomap.planetiler.TestUtils;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
|
@ -85,4 +89,57 @@ class FileUtilsTest {
|
|||
Files.readString(dest.resolve("shapefile").resolve("stations.cpg"))
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWalkPathWithPatternDirectory() throws IOException {
|
||||
Path parent = tmpDir.resolve(Path.of("a", "b", "c"));
|
||||
FileUtils.createDirectory(parent);
|
||||
|
||||
List<Path> txtFiles = Stream.of("1.txt", "2.txt").map(parent::resolve).toList();
|
||||
|
||||
for (var file : txtFiles) {
|
||||
Files.write(file, new byte[]{});
|
||||
}
|
||||
|
||||
Files.write(parent.resolve("something-that-doesnt-match.blah"), new byte[]{});
|
||||
|
||||
var matchingPaths = FileUtils.walkPathWithPattern(parent, "*.txt");
|
||||
|
||||
assertEquals(
|
||||
txtFiles.stream().sorted().toList(),
|
||||
matchingPaths.stream().sorted().toList()
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWalkPathWithPatternDirectoryZip() throws IOException {
|
||||
Path parent = tmpDir.resolve(Path.of("a", "b", "c"));
|
||||
FileUtils.createDirectory(parent);
|
||||
|
||||
Path zipFile = parent.resolve("fake-zip-file.zip");
|
||||
|
||||
Files.write(zipFile, new byte[]{});
|
||||
Files.write(parent.resolve("something-that-doesnt-match.blah"), new byte[]{});
|
||||
|
||||
Function<Path, List<Path>> mockWalkZipFile = zipPath -> List.of(zipPath.resolve("inner.txt"));
|
||||
|
||||
// When we don't provide a callback to recurse into zip files, the path to the zip
|
||||
// itself should be returned.
|
||||
assertEquals(List.of(zipFile), FileUtils.walkPathWithPattern(parent, "*.zip"));
|
||||
|
||||
// Otherwise, the files inside the zip should be returned.
|
||||
assertEquals(List.of(zipFile.resolve("inner.txt")),
|
||||
FileUtils.walkPathWithPattern(parent, "*.zip", mockWalkZipFile));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWalkPathWithPatternSingleZip() {
|
||||
Path zipPath = TestUtils.pathToResource("shapefile.zip");
|
||||
|
||||
var matchingPaths = FileUtils.walkPathWithPattern(zipPath, "stations.sh[px]");
|
||||
|
||||
assertEquals(
|
||||
List.of("/shapefile/stations.shp", "/shapefile/stations.shx"),
|
||||
matchingPaths.stream().map(Path::toString).sorted().toList());
|
||||
}
|
||||
}
|
||||
|
|
Ładowanie…
Reference in New Issue