From d9a5c8f38765812e93ffc9a28480423e6e81c95e Mon Sep 17 00:00:00 2001 From: Angus Gratton Date: Thu, 16 May 2019 10:36:48 +0800 Subject: [PATCH 1/3] esp32: Use FPU for floating point divide, power, complex multiplications * Linker was choosing ROM symbols for these, which use integer soft-float operations and are much slower. * _divsf3() moved to IRAM to avoid regressions with any code that does integer float division in IRAM interrupt handlers (+88 bytes IRAM) * Thanks to michal for reporting: https://esp32.com/viewtopic.php?f=14&t=10540&p=43367 --- components/esp32/linker.lf | 1 + components/esp_rom/esp32/ld/esp32.rom.libgcc.ld | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/components/esp32/linker.lf b/components/esp32/linker.lf index b7bb89f6e6..db73d2dc9f 100644 --- a/components/esp32/linker.lf +++ b/components/esp32/linker.lf @@ -7,6 +7,7 @@ entries: archive: libgcc.a entries: lib2funcs (noflash_text) + _divsf3 (noflash) [mapping:gcov] archive: libgcov.a diff --git a/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld b/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld index dfdbe25881..0bda115dc6 100644 --- a/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld +++ b/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld @@ -24,8 +24,6 @@ __ctzsi2 = 0x4000c7f0; __divdc3 = 0x400645a4; __divdf3 = 0x40002954; __divdi3 = 0x4000ca84; -__divsc3 = 0x4006429c; -__divsf3 = 0x4000234c; __divsi3 = 0x4000c7b8; __eqdf2 = 0x400636a8; __eqsf2 = 0x40063374; @@ -62,7 +60,6 @@ __modsi3 = 0x4000c7c0; __muldc3 = 0x40063c90; __muldf3 = 0x4006358c; __muldi3 = 0x4000c9fc; -__mulsc3 = 0x40063944; __mulsf3 = 0x400632c8; __mulsi3 = 0x4000c7b0; __mulvdi3 = 0x40002d78; @@ -80,7 +77,6 @@ __popcount_tab = 0x3ff96544; __popcountdi2 = 0x40002ef8; __popcountsi2 = 0x40002ed0; __powidf2 = 0x400638e4; -__powisf2 = 0x4006389c; __subdf3 = 0x400026e4; __subsf3 = 0x400021d0; __subvdi3 = 0x40002d20; From c7a0d5e06345819ca818f3e9abcaea182b3dddc5 Mon Sep 17 00:00:00 2001 From: Angus Gratton 
Date: Thu, 16 May 2019 10:43:25 +0800 Subject: [PATCH 2/3] Fix bug in the floating point unit test code --- components/esp32/test/test_fp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/components/esp32/test/test_fp.c b/components/esp32/test/test_fp.c index 180999f9bc..8f08ed9c46 100644 --- a/components/esp32/test/test_fp.c +++ b/components/esp32/test/test_fp.c @@ -4,6 +4,10 @@ #include "freertos/task.h" #include "unity.h" +/* Note: these functions are included here for unit test purposes. They are not needed for writing + * normal code. If writing standard C floating point code, libgcc should correctly include implementations + * that use the floating point registers correctly. */ + static float addsf(float a, float b) { float result; @@ -48,7 +52,7 @@ static float divsf(float a, float b) "const.s f2, 0 \n" "neg.s f9, f8 \n" "maddn.s f5,f4,f6 \n" - "maddn.s f2, f0, f3 \n" + "maddn.s f2, f9, f3 \n" "mkdadj.s f7, f0 \n" "maddn.s f6,f5,f6 \n" "maddn.s f9,f4,f2 \n" From 0b70dfc27f98eb701383a75b32d9dda03fe75462 Mon Sep 17 00:00:00 2001 From: Angus Gratton Date: Thu, 16 May 2019 11:19:32 +0800 Subject: [PATCH 3/3] Add floating point performance test --- components/esp32/test/test_fp.c | 69 +++++++++++++++++++ components/idf_test/include/idf_performance.h | 3 + 2 files changed, 72 insertions(+) diff --git a/components/esp32/test/test_fp.c b/components/esp32/test/test_fp.c index 8f08ed9c46..c9454107a6 100644 --- a/components/esp32/test/test_fp.c +++ b/components/esp32/test/test_fp.c @@ -1,8 +1,10 @@ #include #include +#include "soc/cpu.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "unity.h" +#include "test_utils.h" /* Note: these functions are included here for unit test purposes. They are not needed for writing * normal code. 
If writing standard C floating point code, libgcc should correctly include implementations @@ -195,3 +197,70 @@ TEST_CASE("context switch saves FP registers", "[fp]") } TEST_ASSERT(state.fail == 0); } + +/* Note: not static, to avoid optimisation of const result */ +float IRAM_ATTR test_fp_benchmark_fp_divide(int counts, unsigned *cycles) +{ + float f = MAXFLOAT; + uint32_t before, after; + RSR(CCOUNT, before); + + for (int i = 0; i < counts; i++) { + f /= 1.000432f; + } + + RSR(CCOUNT, after); + *cycles = (after - before) / counts; + + return f; +} + +TEST_CASE("floating point division performance", "[fp]") +{ + const unsigned COUNTS = 1000; + unsigned cycles = 0; + + // initialize fpu + volatile __attribute__((unused)) float dummy = sqrtf(rand()); + + float f = test_fp_benchmark_fp_divide(COUNTS, &cycles); + + printf("%u divisions from %f = %f\n", COUNTS, MAXFLOAT, f); + printf("Per division = %u cycles\n", cycles); + + TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_DIV, "%d cycles", cycles); +} + +/* Note: not static, to avoid optimisation of const result */ +float IRAM_ATTR test_fp_benchmark_fp_sqrt(int counts, unsigned *cycles) +{ + float f = MAXFLOAT; + uint32_t before, after; + RSR(CCOUNT, before); + + for (int i = 0; i < counts; i++) { + f = sqrtf(f); + } + + RSR(CCOUNT, after); + *cycles = (after - before) / counts; + + return f; +} + +TEST_CASE("floating point square root performance", "[fp]") +{ + const unsigned COUNTS = 200; + unsigned cycles = 0; + + // initialize fpu + volatile float __attribute__((unused)) dummy = sqrtf(rand()); + + float f = test_fp_benchmark_fp_sqrt(COUNTS, &cycles); + + printf("%u square roots from %f = %f\n", COUNTS, MAXFLOAT, f); + printf("Per sqrt = %u cycles\n", cycles); + + TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_SQRT, "%d cycles", cycles); +} + diff --git a/components/idf_test/include/idf_performance.h b/components/idf_test/include/idf_performance.h index 55809e90ae..6cdb288236 100644 --- 
a/components/idf_test/include/idf_performance.h +++ b/components/idf_test/include/idf_performance.h @@ -27,4 +27,7 @@ #define IDF_PERFORMANCE_MAX_ESP32_TIME_SHA512_32KB 4500 // AES-CBC hardware throughput (accounts for worst-case performance with PSRAM workaround) #define IDF_PERFORMANCE_MIN_AES_CBC_THROUGHPUT_MBSEC 8.5 +// maximum CPU cycles per floating point divide and per sqrt (configured for worst-case with PSRAM workaround) +#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_DIV 70 +#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_SQRT 140