kopia lustrzana https://github.com/espressif/esp-idf
Merge branch 'bugfix/libgcc_fpu_functions' into 'master'
esp32: Use FPU for floating point divide, power, complex multiplications
See merge request idf/esp-idf!5005
pull/3589/head
commit
db6a30b446
|
@ -7,6 +7,7 @@ entries:
|
||||||
archive: libgcc.a
|
archive: libgcc.a
|
||||||
entries:
|
entries:
|
||||||
lib2funcs (noflash_text)
|
lib2funcs (noflash_text)
|
||||||
|
_divsf3 (noflash)
|
||||||
|
|
||||||
[mapping:gcov]
|
[mapping:gcov]
|
||||||
archive: libgcov.a
|
archive: libgcov.a
|
||||||
|
|
|
@ -1,8 +1,14 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "soc/cpu.h"
|
||||||
#include "freertos/FreeRTOS.h"
|
#include "freertos/FreeRTOS.h"
|
||||||
#include "freertos/task.h"
|
#include "freertos/task.h"
|
||||||
#include "unity.h"
|
#include "unity.h"
|
||||||
|
#include "test_utils.h"
|
||||||
|
|
||||||
|
/* Note: these functions are included here for unit test purposes. They are not needed for writing
|
||||||
|
* normal code. If writing standard C floating point code, libgcc should correctly include implementations
|
||||||
|
* that use the floating point registers correctly. */
|
||||||
|
|
||||||
static float addsf(float a, float b)
|
static float addsf(float a, float b)
|
||||||
{
|
{
|
||||||
|
@ -48,7 +54,7 @@ static float divsf(float a, float b)
|
||||||
"const.s f2, 0 \n"
|
"const.s f2, 0 \n"
|
||||||
"neg.s f9, f8 \n"
|
"neg.s f9, f8 \n"
|
||||||
"maddn.s f5,f4,f6 \n"
|
"maddn.s f5,f4,f6 \n"
|
||||||
"maddn.s f2, f0, f3 \n"
|
"maddn.s f2, f9, f3 \n"
|
||||||
"mkdadj.s f7, f0 \n"
|
"mkdadj.s f7, f0 \n"
|
||||||
"maddn.s f6,f5,f6 \n"
|
"maddn.s f6,f5,f6 \n"
|
||||||
"maddn.s f9,f4,f2 \n"
|
"maddn.s f9,f4,f2 \n"
|
||||||
|
@ -191,3 +197,70 @@ TEST_CASE("context switch saves FP registers", "[fp]")
|
||||||
}
|
}
|
||||||
TEST_ASSERT(state.fail == 0);
|
TEST_ASSERT(state.fail == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Note: not static, to avoid optimisation of const result */
|
||||||
|
float IRAM_ATTR test_fp_benchmark_fp_divide(int counts, unsigned *cycles)
|
||||||
|
{
|
||||||
|
float f = MAXFLOAT;
|
||||||
|
uint32_t before, after;
|
||||||
|
RSR(CCOUNT, before);
|
||||||
|
|
||||||
|
for (int i = 0; i < counts; i++) {
|
||||||
|
f /= 1.000432f;
|
||||||
|
}
|
||||||
|
|
||||||
|
RSR(CCOUNT, after);
|
||||||
|
*cycles = (after - before) / counts;
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CASE("floating point division performance", "[fp]")
|
||||||
|
{
|
||||||
|
const unsigned COUNTS = 1000;
|
||||||
|
unsigned cycles = 0;
|
||||||
|
|
||||||
|
// initialize fpu
|
||||||
|
volatile __attribute__((unused)) float dummy = sqrtf(rand());
|
||||||
|
|
||||||
|
float f = test_fp_benchmark_fp_divide(COUNTS, &cycles);
|
||||||
|
|
||||||
|
printf("%d divisions from %f = %f\n", COUNTS, MAXFLOAT, f);
|
||||||
|
printf("Per division = %d cycles\n", cycles);
|
||||||
|
|
||||||
|
TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_DIV, "%d cycles", cycles);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Note: not static, to avoid optimisation of const result */
|
||||||
|
float IRAM_ATTR test_fp_benchmark_fp_sqrt(int counts, unsigned *cycles)
|
||||||
|
{
|
||||||
|
float f = MAXFLOAT;
|
||||||
|
uint32_t before, after;
|
||||||
|
RSR(CCOUNT, before);
|
||||||
|
|
||||||
|
for (int i = 0; i < counts; i++) {
|
||||||
|
f = sqrtf(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
RSR(CCOUNT, after);
|
||||||
|
*cycles = (after - before) / counts;
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CASE("floating point square root performance", "[fp]")
|
||||||
|
{
|
||||||
|
const unsigned COUNTS = 200;
|
||||||
|
unsigned cycles = 0;
|
||||||
|
|
||||||
|
// initialize fpu
|
||||||
|
volatile float __attribute__((unused)) dummy = sqrtf(rand());
|
||||||
|
|
||||||
|
float f = test_fp_benchmark_fp_sqrt(COUNTS, &cycles);
|
||||||
|
|
||||||
|
printf("%d square roots from %f = %f\n", COUNTS, MAXFLOAT, f);
|
||||||
|
printf("Per sqrt = %d cycles\n", cycles);
|
||||||
|
|
||||||
|
TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_SQRT, "%d cycles", cycles);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,8 +24,6 @@ __ctzsi2 = 0x4000c7f0;
|
||||||
__divdc3 = 0x400645a4;
|
__divdc3 = 0x400645a4;
|
||||||
__divdf3 = 0x40002954;
|
__divdf3 = 0x40002954;
|
||||||
__divdi3 = 0x4000ca84;
|
__divdi3 = 0x4000ca84;
|
||||||
__divsc3 = 0x4006429c;
|
|
||||||
__divsf3 = 0x4000234c;
|
|
||||||
__divsi3 = 0x4000c7b8;
|
__divsi3 = 0x4000c7b8;
|
||||||
__eqdf2 = 0x400636a8;
|
__eqdf2 = 0x400636a8;
|
||||||
__eqsf2 = 0x40063374;
|
__eqsf2 = 0x40063374;
|
||||||
|
@ -62,7 +60,6 @@ __modsi3 = 0x4000c7c0;
|
||||||
__muldc3 = 0x40063c90;
|
__muldc3 = 0x40063c90;
|
||||||
__muldf3 = 0x4006358c;
|
__muldf3 = 0x4006358c;
|
||||||
__muldi3 = 0x4000c9fc;
|
__muldi3 = 0x4000c9fc;
|
||||||
__mulsc3 = 0x40063944;
|
|
||||||
__mulsf3 = 0x400632c8;
|
__mulsf3 = 0x400632c8;
|
||||||
__mulsi3 = 0x4000c7b0;
|
__mulsi3 = 0x4000c7b0;
|
||||||
__mulvdi3 = 0x40002d78;
|
__mulvdi3 = 0x40002d78;
|
||||||
|
@ -80,7 +77,6 @@ __popcount_tab = 0x3ff96544;
|
||||||
__popcountdi2 = 0x40002ef8;
|
__popcountdi2 = 0x40002ef8;
|
||||||
__popcountsi2 = 0x40002ed0;
|
__popcountsi2 = 0x40002ed0;
|
||||||
__powidf2 = 0x400638e4;
|
__powidf2 = 0x400638e4;
|
||||||
__powisf2 = 0x4006389c;
|
|
||||||
__subdf3 = 0x400026e4;
|
__subdf3 = 0x400026e4;
|
||||||
__subsf3 = 0x400021d0;
|
__subsf3 = 0x400021d0;
|
||||||
__subvdi3 = 0x40002d20;
|
__subvdi3 = 0x40002d20;
|
||||||
|
|
|
@ -27,4 +27,7 @@
|
||||||
#define IDF_PERFORMANCE_MAX_ESP32_TIME_SHA512_32KB 4500
|
#define IDF_PERFORMANCE_MAX_ESP32_TIME_SHA512_32KB 4500
|
||||||
// AES-CBC hardware throughput (accounts for worst-case performance with PSRAM workaround)
|
// AES-CBC hardware throughput (accounts for worst-case performance with PSRAM workaround)
|
||||||
#define IDF_PERFORMANCE_MIN_AES_CBC_THROUGHPUT_MBSEC 8.5
|
#define IDF_PERFORMANCE_MIN_AES_CBC_THROUGHPUT_MBSEC 8.5
|
||||||
|
// CPU cycles per floating point divide and per sqrt (configured for worst-case with PSRAM workaround)
|
||||||
|
#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_DIV 70
|
||||||
|
#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_SQRT 140
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue