diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java index 2a0bf048..ba9ba66e 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java @@ -29,7 +29,6 @@ import java.io.IOException; import java.nio.file.FileSystem; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.PathMatcher; import java.time.Instant; import java.util.ArrayList; import java.util.List; @@ -253,48 +252,53 @@ public class Planetiler { } /** - * Adds a new ESRI shapefile directory source that will process all files under {@param basePath} matching - * {@param globPattern} using an explicit projection. + * Adds a new ESRI shapefile glob source that will process all files under {@param basePath} matching + * {@param globPattern}. {@param basePath} may be a directory or ZIP archive. + * + * @param sourceName string to use in stats and logs to identify this stage + * @param basePath path to the directory or zip file containing shapefiles to process + * @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}. + * @return this runner instance for chaining + * @see ShapefileReader + */ + public Planetiler addShapefileGlobSource(String sourceName, Path basePath, String globPattern) { + return addShapefileGlobSource(null, sourceName, basePath, globPattern, null); + } + + /** + * Adds a new ESRI shapefile glob source that will process all files under {@param basePath} matching + * {@param globPattern} using an explicit projection. {@param basePath} may be a directory or ZIP archive. + *

+ * If {@param globPattern} matches a ZIP archive, all files ending in {@code .shp} within the archive will be used for + * this source. + *

+ * If the file does not exist and {@code download=true} argument is set, then the file will first be downloaded from + * {@code defaultUrl}. + *

* * @param projection the Coordinate Reference System authority code to use, parsed with * {@link org.geotools.referencing.CRS#decode(String)} * @param sourceName string to use in stats and logs to identify this stage - * @param basePath path to the directory containing shapefiles to process + * @param basePath path to the directory or zip file containing shapefiles to process * @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}. + * @param defaultUrl remote URL to download the file from if {@code download=true} argument is set and + * {@code name_url} argument is not set * @return this runner instance for chaining * @see ShapefileReader */ - public Planetiler addShapefileDirectorySource(String projection, String sourceName, Path basePath, - String globPattern) { - Path dirPath = getPath(sourceName, "shapefile directory", basePath, null); - PathMatcher matcher = dirPath.getFileSystem().getPathMatcher("glob:" + globPattern); + public Planetiler addShapefileGlobSource(String projection, String sourceName, Path basePath, + String globPattern, String defaultUrl) { + Path dirPath = getPath(sourceName, "shapefile glob", basePath, defaultUrl); return addStage(sourceName, "Process all files matching " + dirPath + "/" + globPattern, ifSourceUsed(sourceName, () -> { - try ( - var walk = Files.walk(dirPath); - var sourcePaths = walk.filter(path -> matcher.matches(path.getFileName())) - ) { - ShapefileReader.processWithProjection(projection, sourceName, sourcePaths.toList(), featureGroup, config, - profile, stats); - } + var sourcePaths = FileUtils.walkPathWithPattern(dirPath, globPattern, + zipPath -> FileUtils.walkPathWithPattern(zipPath, "*.shp")); + ShapefileReader.processWithProjection(projection, sourceName, sourcePaths, featureGroup, config, + profile, stats); })); } - /** - * Adds a new ESRI shapefile directory source that will process all files under {@param basePath} matching - * {@param globPattern}.
- * - * @param sourceName string to use in stats and logs to identify this stage - * @param basePath path to the directory containing shapefiles to process - * @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}. - * @return this runner instance for chaining - * @see ShapefileReader - */ - public Planetiler addShapefileDirectorySource(String sourceName, Path basePath, String globPattern) { - return addShapefileDirectorySource(null, sourceName, basePath, globPattern); - } - /** * Adds a new ESRI shapefile source that will be processed with an explicit projection when {@link #run()} is called. @@ -320,9 +324,14 @@ public class Planetiler { public Planetiler addShapefileSource(String projection, String name, Path defaultPath, String defaultUrl) { Path path = getPath(name, "shapefile", defaultPath, defaultUrl); return addStage(name, "Process features in " + path, - ifSourceUsed(name, - () -> ShapefileReader.processWithProjection(projection, name, List.of(path), featureGroup, config, profile, - stats))); + ifSourceUsed(name, () -> { + List sourcePaths = List.of(path); + if (FileUtils.hasExtension(path, "zip") || Files.isDirectory(path)) { + sourcePaths = FileUtils.walkPathWithPattern(path, "*.shp"); + } + + ShapefileReader.processWithProjection(projection, name, sourcePaths, featureGroup, config, profile, stats); + })); } /** diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/reader/ShapefileReader.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/reader/ShapefileReader.java index 4d62cb48..5b516247 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/reader/ShapefileReader.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/reader/ShapefileReader.java @@ -4,17 +4,12 @@ import com.onthegomap.planetiler.Profile; import com.onthegomap.planetiler.collection.FeatureGroup; import com.onthegomap.planetiler.config.PlanetilerConfig; import 
com.onthegomap.planetiler.stats.Stats; -import com.onthegomap.planetiler.util.FileUtils; import java.io.IOException; import java.io.UncheckedIOException; -import java.net.URI; -import java.nio.file.FileSystems; -import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; import java.util.List; import java.util.function.Consumer; -import java.util.stream.Stream; import org.geotools.data.FeatureSource; import org.geotools.data.shapefile.ShapefileDataStore; import org.geotools.feature.FeatureCollection; @@ -96,34 +91,9 @@ public class ShapefileReader extends SimpleReader { ); } - private static URI findShpFile(Path path, Stream walkStream) { - return walkStream - .filter(z -> FileUtils.hasExtension(z, "shp")) - .findFirst() - .orElseThrow(() -> new IllegalArgumentException("No .shp file found inside " + path)) - .toUri(); - } - private ShapefileDataStore open(Path path) { try { - URI uri; - if (Files.isDirectory(path)) { - try (var walkStream = Files.walk(path)) { - uri = findShpFile(path, walkStream); - } - } else if (FileUtils.hasExtension(path, "zip")) { - try ( - var zipFs = FileSystems.newFileSystem(path); - var walkStream = FileUtils.walkFileSystem(zipFs) - ) { - uri = findShpFile(path, walkStream); - } - } else if (FileUtils.hasExtension(path, "shp")) { - uri = path.toUri(); - } else { - throw new IllegalArgumentException("Invalid shapefile input: " + path + " must be zip or shp"); - } - var store = new ShapefileDataStore(uri.toURL()); + var store = new ShapefileDataStore(path.toUri().toURL()); store.setTryCPGFile(true); return store; } catch (IOException e) { diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java index 602563de..d3c3b4e9 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java @@ -5,12 +5,16 @@ import 
java.io.InputStream; import java.io.UncheckedIOException; import java.nio.file.FileStore; import java.nio.file.FileSystem; +import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.nio.file.PathMatcher; import java.nio.file.StandardOpenOption; import java.util.Comparator; +import java.util.List; import java.util.Objects; +import java.util.function.Function; import java.util.stream.Stream; import java.util.stream.StreamSupport; import java.util.zip.ZipEntry; @@ -22,6 +26,7 @@ import org.slf4j.LoggerFactory; * Convenience methods for working with files on disk. */ public class FileUtils { + private static final Format FORMAT = Format.defaultInstance(); // Prevent zip-bomb attack, see https://rules.sonarsource.com/java/RSPEC-5042 private static final int ZIP_THRESHOLD_ENTRIES = 10_000; @@ -45,6 +50,61 @@ public class FileUtils { }); } + /** + * Returns list of paths matching {@param pattern} within {@param basePath}. + *

+ * If {@param basePath} is a directory, then {@param walkZipFile} will be invoked for each matching {@code .zip} file + * found. This function should return paths of interest within the zip file. + * + * @param basePath file path to recursively walk, either a directory or ZIP archive. + * @param pattern pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}. + * @param walkZipFile callback function to recurse into matching {@code .zip} files. + */ + public static List<Path> walkPathWithPattern(Path basePath, String pattern, + Function<Path, List<Path>> walkZipFile) { + PathMatcher matcher = basePath.getFileSystem().getPathMatcher("glob:" + pattern); + + try { + if (FileUtils.hasExtension(basePath, "zip")) { + try ( + var zipFs = FileSystems.newFileSystem(basePath); + var walkStream = FileUtils.walkFileSystem(zipFs) + ) { + return walkStream + .filter(p -> p.getFileName() != null && matcher.matches(p.getFileName())) + .toList(); + } + } else if (Files.isDirectory(basePath)) { + try (var walk = Files.walk(basePath)) { + return walk + .filter(path -> path.getFileName() != null && matcher.matches(path.getFileName())) + .flatMap(path -> { + if (FileUtils.hasExtension(path, "zip")) { + return walkZipFile.apply(path).stream(); + } else { + return Stream.of(path); + } + }) + .toList(); + } + } else { + throw new IllegalArgumentException("No files matching " + basePath + "/" + pattern); + } + } catch (IOException exc) { + throw new UncheckedIOException(exc); + } + } + + /** + * Returns list of paths matching {@param pattern} within {@param basePath}. + * + * @param basePath file path to recursively walk, either a directory or ZIP archive. + * @param pattern pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}. + */ + public static List<Path> walkPathWithPattern(Path basePath, String pattern) { + return walkPathWithPattern(basePath, pattern, zipPath -> List.of(zipPath)); + } + /** Returns true if {@code path} ends with ".extension" (case-insensitive).
*/ public static boolean hasExtension(Path path, String extension) { return path.toString().toLowerCase().endsWith("." + extension.toLowerCase()); diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java index 2ee843ff..faf435b7 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java @@ -1700,14 +1700,17 @@ class PlanetilerTests { .setAttr("source", source.getSource()); } }) - .addShapefileDirectorySource("shapefile-dir", resourceDir, "shape*.zip") + // Match *.shp within [shapefile.zip, shapefile-copy.zip] + .addShapefileGlobSource("shapefile-glob", resourceDir, "shape*.zip") + // Match *.shp within shapefile.zip + .addShapefileGlobSource("shapefile-glob-zip", resourceDir.resolve("shapefile.zip"), "*.shp") + // Match *.shp within shapefile.zip .addShapefileSource("shapefile", resourceDir.resolve("shapefile.zip")) .setOutput("mbtiles", mbtiles) .run(); try (Mbtiles db = Mbtiles.newReadOnlyDatabase(mbtiles)) { - long fileCount = 0; - long dirCount = 0; + long fileCount = 0, globCount = 0, globZipCount = 0; var tileMap = TestUtils.getTileMap(db); for (var tile : tileMap.values()) { for (var feature : tile) { @@ -1715,15 +1718,17 @@ class PlanetilerTests { switch ((String) feature.attrs().get("source")) { case "shapefile" -> fileCount++; - case "shapefile-dir" -> dirCount++; + case "shapefile-glob" -> globCount++; + case "shapefile-glob-zip" -> globZipCount++; } } } - // Input file was copied twice into test directory, directory source should have - // 2x the number of features. assertTrue(fileCount > 0); - assertEquals(2 * fileCount, dirCount); + // `shapefile` and `shapefile-glob-zip` both match only one file. 
+ assertEquals(fileCount, globZipCount); + // `shapefile-glob` matches two input files, should have 2x number of features of `shapefile`. + assertEquals(2 * fileCount, globCount); } } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/ShapefileReaderTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/ShapefileReaderTest.java index 7008b327..47e66a3c 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/ShapefileReaderTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/ShapefileReaderTest.java @@ -27,18 +27,11 @@ class ShapefileReaderTest { @TempDir private Path tempDir; - @Test - @Timeout(30) - void testReadShapefile() { - testReadShapefile(TestUtils.pathToResource("shapefile.zip")); - } - @Test @Timeout(30) @DisabledOnOs(OS.WINDOWS) // the zip file doesn't fully close, which causes trouble running test on windows void testReadShapefileExtracted() throws IOException { var extracted = TestUtils.extractPathToResource(tempDir, "shapefile.zip"); - testReadShapefile(extracted); try (var fs = FileSystems.newFileSystem(extracted)) { var path = fs.getPath("shapefile", "stations.shp"); testReadShapefile(path); @@ -50,7 +43,6 @@ class ShapefileReaderTest { void testReadShapefileUnzipped() throws IOException { var dest = tempDir.resolve("shapefile.zip"); FileUtils.unzipResource("/shapefile.zip", dest); - testReadShapefile(dest); testReadShapefile(dest.resolve("shapefile").resolve("stations.shp")); } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java index 63babeed..8a94fc13 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java @@ -4,11 +4,15 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static 
org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; +import com.onthegomap.planetiler.TestUtils; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -85,4 +89,57 @@ class FileUtilsTest { Files.readString(dest.resolve("shapefile").resolve("stations.cpg")) ); } + + @Test + void testWalkPathWithPatternDirectory() throws IOException { + Path parent = tmpDir.resolve(Path.of("a", "b", "c")); + FileUtils.createDirectory(parent); + + List txtFiles = Stream.of("1.txt", "2.txt").map(parent::resolve).toList(); + + for (var file : txtFiles) { + Files.write(file, new byte[]{}); + } + + Files.write(parent.resolve("something-that-doesnt-match.blah"), new byte[]{}); + + var matchingPaths = FileUtils.walkPathWithPattern(parent, "*.txt"); + + assertEquals( + txtFiles.stream().sorted().toList(), + matchingPaths.stream().sorted().toList() + ); + } + + @Test + void testWalkPathWithPatternDirectoryZip() throws IOException { + Path parent = tmpDir.resolve(Path.of("a", "b", "c")); + FileUtils.createDirectory(parent); + + Path zipFile = parent.resolve("fake-zip-file.zip"); + + Files.write(zipFile, new byte[]{}); + Files.write(parent.resolve("something-that-doesnt-match.blah"), new byte[]{}); + + Function> mockWalkZipFile = zipPath -> List.of(zipPath.resolve("inner.txt")); + + // When we don't provide a callback to recurse into zip files, the path to the zip + // itself should be returned. + assertEquals(List.of(zipFile), FileUtils.walkPathWithPattern(parent, "*.zip")); + + // Otherwise, the files inside the zip should be returned. 
+ assertEquals(List.of(zipFile.resolve("inner.txt")), + FileUtils.walkPathWithPattern(parent, "*.zip", mockWalkZipFile)); + } + + @Test + void testWalkPathWithPatternSingleZip() { + Path zipPath = TestUtils.pathToResource("shapefile.zip"); + + var matchingPaths = FileUtils.walkPathWithPattern(zipPath, "stations.sh[px]"); + + assertEquals( + List.of("/shapefile/stations.shp", "/shapefile/stations.shx"), + matchingPaths.stream().map(Path::toString).sorted().toList()); + } }