Extract ZIP file walker out of ShapefileReader. (#423)

pull/427/head
Erik Price 2023-01-01 14:29:00 -08:00 zatwierdzone przez GitHub
rodzic 7adf46819a
commit 184f950051
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
6 zmienionych plików z 171 dodań i 78 usunięć

Wyświetl plik

@ -29,7 +29,6 @@ import java.io.IOException;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
@ -253,48 +252,53 @@ public class Planetiler {
}
/**
* Adds a new ESRI shapefile glob source that will process all files under {@code basePath} matching
* {@code globPattern}. {@code basePath} may be a directory or ZIP archive.
* <p>
* This overload passes {@code null} for both the projection and the default download URL.
*
* @param sourceName string to use in stats and logs to identify this stage
* @param basePath path to the directory or ZIP archive containing shapefiles to process
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
* @return this runner instance for chaining
* @see ShapefileReader
*/
public Planetiler addShapefileGlobSource(String sourceName, Path basePath, String globPattern) {
// Delegate to the full overload with no explicit projection and no default URL.
return addShapefileGlobSource(null, sourceName, basePath, globPattern, null);
}
/**
* Adds a new ESRI shapefile glob source that will process all files under {@code basePath} matching
* {@code globPattern} using an explicit projection. {@code basePath} may be a directory or ZIP archive.
* <p>
* If {@code globPattern} matches a ZIP archive, all files ending in {@code .shp} within the archive will be used for
* this source.
* <p>
* If the file does not exist and {@code download=true} argument is set, then the file will first be downloaded from
* {@code defaultUrl}.
*
* @param projection the Coordinate Reference System authority code to use, parsed with
* {@link org.geotools.referencing.CRS#decode(String)}
* @param sourceName string to use in stats and logs to identify this stage
* @param basePath path to the directory or zip file containing shapefiles to process
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
* @param defaultUrl remote URL to download the file from if {@code download=true} argument is set and
* {@code name_url} argument is not set
* @return this runner instance for chaining
* @see ShapefileReader
*/
public Planetiler addShapefileDirectorySource(String projection, String sourceName, Path basePath,
String globPattern) {
Path dirPath = getPath(sourceName, "shapefile directory", basePath, null);
PathMatcher matcher = dirPath.getFileSystem().getPathMatcher("glob:" + globPattern);
public Planetiler addShapefileGlobSource(String projection, String sourceName, Path basePath,
String globPattern, String defaultUrl) {
Path dirPath = getPath(sourceName, "shapefile glob", basePath, defaultUrl);
return addStage(sourceName, "Process all files matching " + dirPath + "/" + globPattern,
ifSourceUsed(sourceName, () -> {
try (
var walk = Files.walk(dirPath);
var sourcePaths = walk.filter(path -> matcher.matches(path.getFileName()))
) {
ShapefileReader.processWithProjection(projection, sourceName, sourcePaths.toList(), featureGroup, config,
profile, stats);
}
// NOTE(review): this walk starts from basePath, whereas the stage description above and the earlier
// Files.walk-based code use dirPath (the value returned by getPath) - confirm which one is intended.
// ZIP archives matched by globPattern are expanded into the *.shp entries they contain.
var sourcePaths = FileUtils.walkPathWithPattern(basePath, globPattern,
zipPath -> FileUtils.walkPathWithPattern(zipPath, "*.shp"));
ShapefileReader.processWithProjection(projection, sourceName, sourcePaths, featureGroup, config,
profile, stats);
}));
}
/**
* Adds a new ESRI shapefile directory source that will process all files under {@code basePath} matching
* {@code globPattern}.
*
* @param sourceName string to use in stats and logs to identify this stage
* @param basePath path to the directory containing shapefiles to process
* @param globPattern string to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
* @return this runner instance for chaining
* @see ShapefileReader
*/
public Planetiler addShapefileDirectorySource(String sourceName, Path basePath, String globPattern) {
// Delegate to the projection-aware overload with no explicit projection.
return addShapefileDirectorySource(null, sourceName, basePath, globPattern);
}
/**
* Adds a new ESRI shapefile source that will be processed with an explicit projection when {@link #run()} is called.
@ -320,9 +324,14 @@ public class Planetiler {
public Planetiler addShapefileSource(String projection, String name, Path defaultPath, String defaultUrl) {
Path path = getPath(name, "shapefile", defaultPath, defaultUrl);
return addStage(name, "Process features in " + path,
ifSourceUsed(name,
() -> ShapefileReader.processWithProjection(projection, name, List.of(path), featureGroup, config, profile,
stats)));
ifSourceUsed(name, () -> {
// By default the resolved path itself is the only input.
List<Path> sourcePaths = List.of(path);
// A ZIP archive or a directory is replaced by the paths inside it whose file name matches *.shp.
if (FileUtils.hasExtension(path, "zip") || Files.isDirectory(path)) {
sourcePaths = FileUtils.walkPathWithPattern(path, "*.shp");
}
ShapefileReader.processWithProjection(projection, name, sourcePaths, featureGroup, config, profile, stats);
}));
}
/**

Wyświetl plik

@ -4,17 +4,12 @@ import com.onthegomap.planetiler.Profile;
import com.onthegomap.planetiler.collection.FeatureGroup;
import com.onthegomap.planetiler.config.PlanetilerConfig;
import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.util.FileUtils;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URI;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.function.Consumer;
import java.util.stream.Stream;
import org.geotools.data.FeatureSource;
import org.geotools.data.shapefile.ShapefileDataStore;
import org.geotools.feature.FeatureCollection;
@ -96,34 +91,9 @@ public class ShapefileReader extends SimpleReader<SimpleFeature> {
);
}
/**
 * Returns the URI of the first path in {@code walkStream} that has a {@code .shp} extension.
 *
 * @param path       location being searched, used only in the error message
 * @param walkStream candidate paths to inspect
 * @throws IllegalArgumentException if no path in {@code walkStream} has a {@code .shp} extension
 */
private static URI findShpFile(Path path, Stream<Path> walkStream) {
  var firstShapefile = walkStream
    .filter(candidate -> FileUtils.hasExtension(candidate, "shp"))
    .findFirst();
  if (firstShapefile.isEmpty()) {
    throw new IllegalArgumentException("No .shp file found inside " + path);
  }
  return firstShapefile.get().toUri();
}
private ShapefileDataStore open(Path path) {
try {
URI uri;
if (Files.isDirectory(path)) {
try (var walkStream = Files.walk(path)) {
uri = findShpFile(path, walkStream);
}
} else if (FileUtils.hasExtension(path, "zip")) {
try (
var zipFs = FileSystems.newFileSystem(path);
var walkStream = FileUtils.walkFileSystem(zipFs)
) {
uri = findShpFile(path, walkStream);
}
} else if (FileUtils.hasExtension(path, "shp")) {
uri = path.toUri();
} else {
throw new IllegalArgumentException("Invalid shapefile input: " + path + " must be zip or shp");
}
var store = new ShapefileDataStore(uri.toURL());
var store = new ShapefileDataStore(path.toUri().toURL());
store.setTryCPGFile(true);
return store;
} catch (IOException e) {

Wyświetl plik

@ -5,12 +5,16 @@ import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.file.FileStore;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.StandardOpenOption;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import java.util.zip.ZipEntry;
@ -22,6 +26,7 @@ import org.slf4j.LoggerFactory;
* Convenience methods for working with files on disk.
*/
public class FileUtils {
private static final Format FORMAT = Format.defaultInstance();
// Prevent zip-bomb attack, see https://rules.sonarsource.com/java/RSPEC-5042
private static final int ZIP_THRESHOLD_ENTRIES = 10_000;
@ -45,6 +50,61 @@ public class FileUtils {
});
}
/**
 * Returns list of paths whose file name matches {@code pattern} within {@code basePath}.
 * <p>
 * If {@code basePath} has a {@code .zip} extension, the entries of the archive are walked and the matching ones are
 * returned; {@code walkZipFile} is not invoked. The ZIP file system opened for the walk is closed again before this
 * method returns.
 * <p>
 * If {@code basePath} is a directory, it is walked recursively and {@code walkZipFile} is invoked for each matching
 * {@code .zip} file found. This function should return paths of interest within the zip file. Every other matching
 * path is returned as-is.
 * <p>
 * Paths without a file name component (such as a file system root) never match.
 *
 * @param basePath    file path to recursively walk, either a directory or ZIP archive.
 * @param pattern     pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
 * @param walkZipFile callback function to recurse into matching {@code .zip} files.
 * @return the matching paths
 * @throws IllegalArgumentException if {@code basePath} is neither a ZIP archive nor a directory
 * @throws UncheckedIOException     if an {@link IOException} occurs while walking
 */
public static List<Path> walkPathWithPattern(Path basePath, String pattern,
  Function<Path, List<Path>> walkZipFile) {
  PathMatcher matcher = basePath.getFileSystem().getPathMatcher("glob:" + pattern);

  try {
    if (FileUtils.hasExtension(basePath, "zip")) {
      try (
        var zipFs = FileSystems.newFileSystem(basePath);
        var walkStream = FileUtils.walkFileSystem(zipFs)
      ) {
        return walkStream
          .filter(p -> matchesFileName(matcher, p))
          .toList();
      }
    } else if (Files.isDirectory(basePath)) {
      try (var walk = Files.walk(basePath)) {
        return walk
          // same null-safe check as the ZIP branch: the walk root itself may have no file name
          .filter(p -> matchesFileName(matcher, p))
          .flatMap(path -> {
            if (FileUtils.hasExtension(path, "zip")) {
              return walkZipFile.apply(path).stream();
            } else {
              return Stream.of(path);
            }
          })
          .toList();
      }
    } else {
      throw new IllegalArgumentException("No files matching " + basePath + "/" + pattern);
    }
  } catch (IOException exc) {
    throw new UncheckedIOException(exc);
  }
}

/** Returns true if {@code path} has a file name component and {@code matcher} accepts that file name. */
private static boolean matchesFileName(PathMatcher matcher, Path path) {
  Path fileName = path.getFileName();
  return fileName != null && matcher.matches(fileName);
}
/**
 * Returns list of paths matching {@code pattern} within {@code basePath}, without descending into any {@code .zip}
 * files found while walking a directory: such an archive is returned as a single path.
 *
 * @param basePath file path to recursively walk, either a directory or ZIP archive.
 * @param pattern  pattern to match filenames against, as described in {@link FileSystem#getPathMatcher(String)}.
 */
public static List<Path> walkPathWithPattern(Path basePath, String pattern) {
  // Identity-style callback: a matching zip file stands for itself.
  Function<Path, List<Path>> keepZipAsIs = List::of;
  return walkPathWithPattern(basePath, pattern, keepZipAsIs);
}
/** Returns true if {@code path} ends with ".extension" (case-insensitive). */
public static boolean hasExtension(Path path, String extension) {
return path.toString().toLowerCase().endsWith("." + extension.toLowerCase());

Wyświetl plik

@ -1700,14 +1700,17 @@ class PlanetilerTests {
.setAttr("source", source.getSource());
}
})
.addShapefileDirectorySource("shapefile-dir", resourceDir, "shape*.zip")
// Match *.shp within [shapefile.zip, shapefile-copy.zip]
.addShapefileGlobSource("shapefile-glob", resourceDir, "shape*.zip")
// Match *.shp within shapefile.zip
.addShapefileGlobSource("shapefile-glob-zip", resourceDir.resolve("shapefile.zip"), "*.shp")
// Match *.shp within shapefile.zip
.addShapefileSource("shapefile", resourceDir.resolve("shapefile.zip"))
.setOutput("mbtiles", mbtiles)
.run();
try (Mbtiles db = Mbtiles.newReadOnlyDatabase(mbtiles)) {
long fileCount = 0;
long dirCount = 0;
long fileCount = 0, globCount = 0, globZipCount = 0;
var tileMap = TestUtils.getTileMap(db);
for (var tile : tileMap.values()) {
for (var feature : tile) {
@ -1715,15 +1718,17 @@ class PlanetilerTests {
switch ((String) feature.attrs().get("source")) {
case "shapefile" -> fileCount++;
case "shapefile-dir" -> dirCount++;
case "shapefile-glob" -> globCount++;
case "shapefile-glob-zip" -> globZipCount++;
}
}
}
// Input file was copied twice into test directory, directory source should have
// 2x the number of features.
assertTrue(fileCount > 0);
assertEquals(2 * fileCount, dirCount);
// `shapefile` and `shapefile-glob-zip` both match only one file.
assertEquals(fileCount, globZipCount);
// `shapefile-glob` matches two input files, should have 2x number of features of `shapefile`.
assertEquals(2 * fileCount, globCount);
}
}

Wyświetl plik

@ -27,18 +27,11 @@ class ShapefileReaderTest {
@TempDir
private Path tempDir;
@Test
@Timeout(30)
void testReadShapefile() {
testReadShapefile(TestUtils.pathToResource("shapefile.zip"));
}
@Test
@Timeout(30)
@DisabledOnOs(OS.WINDOWS) // the zip file doesn't fully close, which causes trouble running test on windows
void testReadShapefileExtracted() throws IOException {
var extracted = TestUtils.extractPathToResource(tempDir, "shapefile.zip");
testReadShapefile(extracted);
try (var fs = FileSystems.newFileSystem(extracted)) {
var path = fs.getPath("shapefile", "stations.shp");
testReadShapefile(path);
@ -50,7 +43,6 @@ class ShapefileReaderTest {
void testReadShapefileUnzipped() throws IOException {
var dest = tempDir.resolve("shapefile.zip");
FileUtils.unzipResource("/shapefile.zip", dest);
testReadShapefile(dest);
testReadShapefile(dest.resolve("shapefile").resolve("stations.shp"));
}

Wyświetl plik

@ -4,11 +4,15 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import com.onthegomap.planetiler.TestUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
@ -85,4 +89,57 @@ class FileUtilsTest {
Files.readString(dest.resolve("shapefile").resolve("stations.cpg"))
);
}
@Test
void testWalkPathWithPatternDirectory() throws IOException {
  Path dir = tmpDir.resolve(Path.of("a", "b", "c"));
  FileUtils.createDirectory(dir);

  // Two empty files that match the pattern, plus one decoy that must be ignored.
  List<Path> expected = List.of(dir.resolve("1.txt"), dir.resolve("2.txt"));
  for (Path txt : expected) {
    Files.write(txt, new byte[0]);
  }
  Files.write(dir.resolve("something-that-doesnt-match.blah"), new byte[0]);

  List<Path> actual = FileUtils.walkPathWithPattern(dir, "*.txt");

  // Walk order is not asserted, so compare sorted copies.
  assertEquals(
    expected.stream().sorted().toList(),
    actual.stream().sorted().toList()
  );
}
@Test
void testWalkPathWithPatternDirectoryZip() throws IOException {
  Path dir = tmpDir.resolve(Path.of("a", "b", "c"));
  FileUtils.createDirectory(dir);

  // An empty file with a .zip name is enough: the callback below never opens it.
  Path fakeZip = dir.resolve("fake-zip-file.zip");
  Files.write(fakeZip, new byte[0]);
  Files.write(dir.resolve("something-that-doesnt-match.blah"), new byte[0]);

  Function<Path, List<Path>> zipWalker = archive -> List.of(archive.resolve("inner.txt"));

  // Without a callback, the path to the zip itself is returned.
  List<Path> withoutCallback = FileUtils.walkPathWithPattern(dir, "*.zip");
  assertEquals(List.of(fakeZip), withoutCallback);

  // With a callback, whatever the callback reports for the zip is returned instead.
  List<Path> withCallback = FileUtils.walkPathWithPattern(dir, "*.zip", zipWalker);
  assertEquals(List.of(fakeZip.resolve("inner.txt")), withCallback);
}
@Test
void testWalkPathWithPatternSingleZip() {
  Path archive = TestUtils.pathToResource("shapefile.zip");

  // The character class matches both stations.shp and stations.shx inside the archive.
  List<String> found = FileUtils.walkPathWithPattern(archive, "stations.sh[px]")
    .stream()
    .map(Path::toString)
    .sorted()
    .toList();

  assertEquals(List.of("/shapefile/stations.shp", "/shapefile/stations.shx"), found);
}
}