Profiling tests: Large test data and wikidata caching (#855)

* Adds a script to download multiple areas and compute their test parameters
* added a large test that uses a combined 76MB file with equatorial-guinea, liechtenstein, district-of-columbia, greater-london
* cache wikidata downloads
pull/854/head^2
Yuri Astrakhan 2020-05-06 11:36:15 -04:00 zatwierdzone przez GitHub
rodzic b6f1e1374b
commit c35cc59bda
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
3 zmienionych plików z 94 dodań i 12 usunięć

Wyświetl plik

@ -0,0 +1,62 @@
#!/usr/bin/env bash
set -euo pipefail
#
# Download several areas, combine them into a single file, and print --bbox params needed to run test-perf
#
# List of Geofabrik areas
TEST_AREAS=(equatorial-guinea liechtenstein district-of-columbia greater-london)
# Environment-overridable settings: download/work directory, the suffix used by
# Geofabrik extracts, and the name of the merged result file.
: "${DATA_DIR:=/tileset/data/test}"
: "${DATA_FILE_SUFFIX:=-latest.osm.pbf}"
: "${RESULT_FILE:=test${DATA_FILE_SUFFIX}}"
mkdir -p "$DATA_DIR"
cd "$DATA_DIR"
echo -e $"\n=========== downloading areas" "${TEST_AREAS[@]}" "==========================="
# Fetch each area's extract unless it already exists in $DATA_DIR (keeping the
# directory between runs acts as a download cache).
for area in "${TEST_AREAS[@]}"; do
file="${area}${DATA_FILE_SUFFIX}"
if [ -f "$file" ]; then
echo "File $file already exists, skipping download"
else
# download-osm: external tool (from openmaptiles-tools — TODO confirm);
# args after `--` are passed through, here the target directory.
download-osm geofabrik "${area}" -- -d "$DATA_DIR"
# Guard against the tool exiting 0 without producing the expected file name.
if [ ! -f "$file" ]; then
echo "Unexpected error while downloading $file, aborting"
exit 1
fi
fi
done
echo -e $"\n=========== Merging" "${TEST_AREAS[@]}" "into ${RESULT_FILE} ====="
rm -f "${RESULT_FILE}"
# Build the osmosis argument string: read the first area's file, then for each
# remaining area append "--read-pbf <file> --merge" so all extracts get merged.
OSMOSIS_ARG="--read-pbf ${TEST_AREAS[0]}${DATA_FILE_SUFFIX} $(printf " --read-pbf %s${DATA_FILE_SUFFIX} --merge" "${TEST_AREAS[@]:1}")"
# Word-splitting of the unquoted $OSMOSIS_ARG below is intentional (it holds
# multiple arguments); `set -x` in the subshell echoes the full command.
# shellcheck disable=SC2086
( set -x; osmosis ${OSMOSIS_ARG} --write-pbf "${RESULT_FILE}" )
echo -e $"\n=========== Computing test BBOXes ======================="
echo -e $"\n File ${RESULT_FILE} ($(du -b "$RESULT_FILE" | cut -f1)) has been generated with these test areas:\n"
# For each area: read its bounding box from `osmconvert --out-statistics`,
# then let openmaptiles.perfutils.TestCase print a ready-to-paste --bbox
# parameter together with tile count and bytes-per-tile stats.
for area in "${TEST_AREAS[@]}"; do
file="${area}${DATA_FILE_SUFFIX}"
STATS=$(osmconvert --out-statistics "$file" )
# Extract the "lon/lat min/max:" values and strip surrounding whitespace.
LON_MIN=$( echo "$STATS" | grep "lon min:" | cut -d":" -f 2 | awk '{gsub(/^ +| +$/,"")} {print $0}' )
LON_MAX=$( echo "$STATS" | grep "lon max:" | cut -d":" -f 2 | awk '{gsub(/^ +| +$/,"")} {print $0}' )
LAT_MIN=$( echo "$STATS" | grep "lat min:" | cut -d":" -f 2 | awk '{gsub(/^ +| +$/,"")} {print $0}' )
LAT_MAX=$( echo "$STATS" | grep "lat max:" | cut -d":" -f 2 | awk '{gsub(/^ +| +$/,"")} {print $0}' )
BBOX="${LON_MIN},${LAT_MIN},${LON_MAX},${LAT_MAX}"
FILE_SIZE="$(du -b "$file" | cut -f1)"
# The heredoc is expanded by the shell (area, BBOX, FILE_SIZE are substituted)
# before being piped to Python. PYTHONPATH assumes the openmaptiles-tools
# source lives at /usr/src/app inside the container — TODO confirm.
cat <<EOF | (PYTHONPATH=/usr/src/app python)
from openmaptiles.perfutils import TestCase
tc = TestCase('${area}', 'a', bbox='${BBOX}')
info = f"# {tc.id} {tc.size():,} tiles at z14, \
{$FILE_SIZE/1024/1024:,.1f}MB, {$FILE_SIZE/tc.size():,.1f} bytes/tile \
[{tc.start[0]}/{tc.start[1]}]x[{tc.before[0] - 1}/{tc.before[1] - 1}]"
print(f" --bbox {tc.bbox:46} {info}")
EOF
done
echo ""

Wyświetl plik

@ -47,21 +47,34 @@ jobs:
# there is no point to run long perf test until we know the code is OK
needs: integrity_test
env:
# Smaller tests (runs everything in about 30 minutes)
# Two test areas: equatorial-guinea and liechtenstein
TEST_PERF_PARAMS: "--minzoom 0 --maxzoom 14 --bbox 5.4172943,-1.6732196,12.3733400,4.3475256 --bbox 9.0900979,46.9688169,9.6717077,47.5258072"
TEST_DATA_URL: "https://drive.google.com/uc?export=download&id=12vw07f9W0MiAHIqMztRiIMwahJfqTi21"
## Smaller tests (runs everything in about 30 minutes)
## Two test areas: equatorial-guinea and liechtenstein
#TEST_DATA_URL: "https://drive.google.com/uc?export=download&id=12vw07f9W0MiAHIqMztRiIMwahJfqTi21"
#TEST_PERF_PARAMS: "--minzoom 0 --maxzoom 14 --bbox 5.4172943,-1.6732196,12.3733400,4.3475256 --bbox 9.0900979,46.9688169,9.6717077,47.5258072"
# Large test, size is 79,472,850
# --bbox 5.4172943,-1.6732196,12.3733400,4.3475256 `# equatorial-guinea 87,768 tiles at z14, 2.1MB, 24.9 bytes/tile [8438/7993]x[8755/8268]` \
# --bbox 9.0900979,46.9688169,9.6717077,47.5258072 `# liechtenstein 1,064 tiles at z14, 2.2MB, 2,217.0 bytes/tile [8605/5727]x[8632/5764]` \
# --bbox -78.7749754,38.7820235,-76.8957735,39.6985009 `# district-of-columbia 4,785 tiles at z14, 16.0MB, 3,508.9 bytes/tile [4606/6220]x[4692/6274]` \
# --bbox -0.6124681,51.2268449,0.3996690,51.7873570 `# greater-london 1,974 tiles at z14, 55.5MB, 29,458.3 bytes/tile [8164/5427]x[8210/5468]` \
TEST_DATA_URL: "https://drive.google.com/uc?export=download&id=18nP3f06aBBiEKhUNmAkqq30gqQnU2_VJ"
TEST_PERF_PARAMS: >-
--minzoom 0 --maxzoom 14
--bbox 5.4172943,-1.6732196,12.3733400,4.3475256
--bbox 9.0900979,46.9688169,9.6717077,47.5258072
--bbox -78.7749754,38.7820235,-76.8957735,39.6985009
--bbox -0.6124681,51.2268449,0.3996690,51.7873570
## Large test data -- we should switch to it after everything is working ok
# TEST_PERF_PARAMS: "--minzoom 0 --maxzoom 14 --test hungary --test isle-of-man"
# TEST_DATA_URL: "https://drive.google.com/uc?export=download&id=1kw7XPDPd1Rc-Zi2XxGLTXdinUSq-S4pT"
# TEST_PERF_PARAMS: "--minzoom 0 --maxzoom 14 --test hungary --test isle-of-man"
steps:
- name: Cache test data download
id: cache-testdata
uses: actions/cache@v1
with:
path: ci_cache
key: "${{ env.TEST_DATA_URL }}"
key: "v2-${{ env.TEST_DATA_URL }}"
- name: Download test data on cache miss
if: steps.cache-testdata.outputs.cache-hit != 'true'
@ -149,6 +162,10 @@ jobs:
profile 1_data make import-data
profile 2_osm make import-osm
profile 3_borders make import-borders
if [ -f ../ci_cache/wikidata-cache.json ]; then
cp ../ci_cache/wikidata-cache.json cache/wikidata-cache.json
fi
profile 4_wikidata make import-wikidata
profile 5_sql make import-sql
@ -179,10 +196,11 @@ jobs:
PROFILE_DIR=../perf_cache
create_db
if [ ! -f ../ci_cache/wikidata-cache.json ]; then
cp cache/wikidata-cache.json ../ci_cache/wikidata-cache.json
fi
# Use latest tools version because these specific tests do not yet exist in the 4.1 tools version
# Custom TOOLS_VERSION can be removed once OMT master is migrated to the next tools version
TOOLS_VERSION=latest profile test-perf docker-compose run --rm -T openmaptiles-tools \
profile test-perf docker-compose run --rm -T openmaptiles-tools \
test-perf openmaptiles.yaml $TEST_PERF_PARAMS \
--record /tileset/results.json
mv results.json ../perf_cache
@ -215,10 +233,8 @@ jobs:
PROFILE_DIR=../artifacts
create_db
# Use latest tools version because these specific tests do not yet exist in the 4.1 tools version
# Custom TOOLS_VERSION can be removed once OMT master is migrated to the next tools version
cp ../perf_cache/results.json .
OUTPUT="$(TOOLS_VERSION=latest profile test-perf docker-compose run --rm -T openmaptiles-tools \
OUTPUT="$(profile test-perf docker-compose run --rm -T openmaptiles-tools \
test-perf openmaptiles.yaml $TEST_PERF_PARAMS \
--compare /tileset/results.json --record /tileset/pr-results.json)"
rm results.json

Wyświetl plik

@ -346,3 +346,7 @@ docker-unnecessary-clean:
# Quick sanity check of the perf-test harness itself: runs test-perf against
# the built-in "null" test area (no real tile data), without colored output.
.PHONY: test-perf-null
test-perf-null:
	$(DOCKER_COMPOSE) run $(DC_OPTS) openmaptiles-tools test-perf openmaptiles.yaml --test null --no-color
# Build the combined multi-area test extract used by the CI performance tests.
# Runs the build-test-data.sh script inside the openmaptiles-tools container
# (the repo is mounted at /tileset there — see the script's default DATA_DIR).
# Uses $(DOCKER_COMPOSE) instead of a literal `docker-compose` for consistency
# with the other targets and so user overrides of DOCKER_COMPOSE take effect.
.PHONY: build-test-pbf
build-test-pbf:
	$(DOCKER_COMPOSE) run $(DC_OPTS) openmaptiles-tools /tileset/.github/workflows/build-test-data.sh