gdma: set transfer ability

pull/7307/head
morris 2021-06-23 14:10:07 +08:00
parent d31b1f79e6
commit d9819bc7ae
10 changed files with 364 additions and 100 deletions


@@ -74,6 +74,8 @@ struct gdma_channel_t {
intr_handle_t intr; // per-channel interrupt handle
gdma_channel_direction_t direction; // channel direction
int periph_id; // Peripheral instance ID, indicates which peripheral is connected to this GDMA channel
size_t sram_alignment; // alignment for memory in SRAM
size_t psram_alignment; // alignment for memory in PSRAM
esp_err_t (*del)(gdma_channel_t *channel); // channel deletion function, it's polymorphic, see `gdma_del_tx_channel` or `gdma_del_rx_channel`
};
@@ -271,6 +273,67 @@ err:
return ret;
}
esp_err_t gdma_set_transfer_ability(gdma_channel_handle_t dma_chan, const gdma_transfer_ability_t *ability)
{
esp_err_t ret = ESP_OK;
gdma_pair_t *pair = NULL;
gdma_group_t *group = NULL;
bool en_burst = true;
ESP_GOTO_ON_FALSE(dma_chan && ability, ESP_ERR_INVALID_ARG, err, TAG, "invalid argument");
pair = dma_chan->pair;
group = pair->group;
size_t sram_alignment = ability->sram_trans_align;
size_t psram_alignment = ability->psram_trans_align;
// alignment should be 2^n
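// note: (x & (x - 1)) == 0 is the usual power-of-two test; it also accepts 0, which here means "no alignment requirement"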
ESP_GOTO_ON_FALSE((sram_alignment & (sram_alignment - 1)) == 0, ESP_ERR_INVALID_ARG, err, TAG, "invalid sram alignment: %zu", sram_alignment);
#if SOC_GDMA_SUPPORT_PSRAM
int block_size_index = 0;
switch (psram_alignment) {
case 64: // 64 Bytes alignment
block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_64B;
break;
case 32: // 32 Bytes alignment
block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_32B;
break;
case 16: // 16 Bytes alignment
block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_16B;
break;
case 0: // no alignment requirement
block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_16B;
psram_alignment = SOC_GDMA_PSRAM_MIN_ALIGN; // fall back to minimal alignment
break;
default:
ESP_GOTO_ON_FALSE(false, ESP_ERR_INVALID_ARG, err, TAG, "invalid psram alignment: %zu", psram_alignment);
break;
}
#endif // #if SOC_GDMA_SUPPORT_PSRAM
if (dma_chan->direction == GDMA_CHANNEL_DIRECTION_TX) {
// TX channel can always enable burst mode, no matter data alignment
gdma_ll_tx_enable_data_burst(group->hal.dev, pair->pair_id, true);
gdma_ll_tx_enable_descriptor_burst(group->hal.dev, pair->pair_id, true);
#if SOC_GDMA_SUPPORT_PSRAM
gdma_ll_tx_set_block_size_psram(group->hal.dev, pair->pair_id, block_size_index);
#endif // #if SOC_GDMA_SUPPORT_PSRAM
} else {
// RX channel burst mode depends on specific data alignment
en_burst = sram_alignment >= 4;
gdma_ll_rx_enable_data_burst(group->hal.dev, pair->pair_id, en_burst);
gdma_ll_rx_enable_descriptor_burst(group->hal.dev, pair->pair_id, en_burst);
#if SOC_GDMA_SUPPORT_PSRAM
gdma_ll_rx_set_block_size_psram(group->hal.dev, pair->pair_id, block_size_index);
#endif // #if SOC_GDMA_SUPPORT_PSRAM
}
dma_chan->sram_alignment = sram_alignment;
dma_chan->psram_alignment = psram_alignment;
ESP_LOGD(TAG, "%s channel (%d,%d), (%zu:%zu) bytes aligned, burst %s", dma_chan->direction == GDMA_CHANNEL_DIRECTION_TX ? "tx" : "rx",
group->group_id, pair->pair_id, sram_alignment, psram_alignment, en_burst ? "enabled" : "disabled");
err:
return ret;
}
esp_err_t gdma_apply_strategy(gdma_channel_handle_t dma_chan, const gdma_strategy_config_t *config)
{
esp_err_t ret = ESP_OK;


@@ -59,10 +59,23 @@ typedef struct {
gdma_channel_handle_t sibling_chan; /*!< DMA sibling channel handle (NULL means having sibling is not necessary) */
gdma_channel_direction_t direction; /*!< DMA channel direction */
struct {
int reserve_sibling: 1; /*!< If set, DMA channel allocator would prefer to allocate new channel in a new pair, and reserve sibling channel for future use */
} flags;
} gdma_channel_alloc_config_t;
/**
* @brief GDMA transfer ability
*
* @note The alignment set in this structure is **not** a guarantee that the GDMA driver will handle non-aligned cases.
* In fact, the GDMA driver has no knowledge of the DMA buffer (address and size) used by the upper layer,
* so it is the responsibility of the **upper layer** to take care of the buffer address and size.
*
*/
typedef struct {
size_t sram_trans_align; /*!< DMA transfer alignment for memory in SRAM, in bytes. The driver enables/disables burst mode based on this value. 0 means no alignment is required */
size_t psram_trans_align; /*!< DMA transfer alignment for memory in PSRAM, in bytes. The driver sets proper burst block size based on the alignment value. 0 means no alignment is required */
} gdma_transfer_ability_t;
/**
* @brief Type of GDMA event data
*
@@ -80,6 +93,9 @@ typedef struct {
* @param event_data GDMA event data
* @param user_data User registered data from `gdma_register_tx_event_callbacks` or `gdma_register_rx_event_callbacks`
*
* @return Whether a task switch is needed after the callback function returns;
* this is usually because the callback wakes up a high-priority task.
*
*/
typedef bool (*gdma_event_callback_t)(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data);
@@ -172,6 +188,18 @@ esp_err_t gdma_connect(gdma_channel_handle_t dma_chan, gdma_trigger_t trig_periph
*/
esp_err_t gdma_disconnect(gdma_channel_handle_t dma_chan);
/**
* @brief Set DMA channel transfer ability
*
* @param[in] dma_chan GDMA channel handle, allocated by `gdma_new_channel`
* @param[in] ability Transfer ability, e.g. alignment
* @return
* - ESP_OK: Set DMA channel transfer ability successfully
* - ESP_ERR_INVALID_ARG: Set DMA channel transfer ability failed because of invalid argument
* - ESP_FAIL: Set DMA channel transfer ability failed because of other error
*/
esp_err_t gdma_set_transfer_ability(gdma_channel_handle_t dma_chan, const gdma_transfer_ability_t *ability);
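/*
 * Usage sketch (illustrative, not part of this header): the 4/64-byte values and the
 * `tx_chan` handle are assumptions, and the caller still has to allocate buffers that
 * honor the declared alignment, e.g. with heap_caps_aligned_alloc():
 *
 *     gdma_transfer_ability_t ability = {
 *         .sram_trans_align = 4,   // burst mode can be enabled for 4-byte aligned SRAM data
 *         .psram_trans_align = 64, // 64-byte block size for PSRAM access
 *     };
 *     ESP_ERROR_CHECK(gdma_set_transfer_ability(tx_chan, &ability));
 *     uint8_t *buf = heap_caps_aligned_alloc(64, 4096, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
 *     assert(buf); // the GDMA driver does not validate buffer alignment for us
 */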
/**
* @brief Apply channel strategy for GDMA channel
*


@@ -11,6 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/param.h>
#include "freertos/FreeRTOS.h"
#include "freertos/semphr.h"
#include "hal/dma_types.h"
@@ -22,6 +24,8 @@
static const char *TAG = "async_memcpy";
#define ALIGN_DOWN(val, align) ((val) & ~((align) - 1))
/**
* @brief Type of async mcp stream
* mcp stream inherits from the DMA descriptor; in addition, it carries a callback function member
@@ -43,7 +47,8 @@ typedef struct async_memcpy_context_t {
dma_descriptor_t *tx_desc; // pointer to the next free TX descriptor
dma_descriptor_t *rx_desc; // pointer to the next free RX descriptor
dma_descriptor_t *next_rx_desc_to_check; // pointer to the next RX descriptor to recycle
uint32_t max_stream_num; // maximum number of streams
size_t max_dma_buffer_size; // maximum buffer size carried by one DMA descriptor
async_memcpy_stream_t *out_streams; // pointer to the first TX stream
async_memcpy_stream_t *in_streams; // pointer to the first RX stream
async_memcpy_stream_t streams_pool[0]; // stream pool (TX + RX), the size is configured during driver installation
@@ -82,9 +87,14 @@ esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_me
mcp_hdl->rx_desc = &mcp_hdl->in_streams[0].desc;
mcp_hdl->next_rx_desc_to_check = &mcp_hdl->in_streams[0].desc;
mcp_hdl->spinlock = (portMUX_TYPE)portMUX_INITIALIZER_UNLOCKED;
mcp_hdl->mcp_impl.sram_trans_align = config->sram_trans_align;
mcp_hdl->mcp_impl.psram_trans_align = config->psram_trans_align;
size_t trans_align = MAX(config->sram_trans_align, config->psram_trans_align);
mcp_hdl->max_dma_buffer_size = trans_align ? ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, trans_align) : DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
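// worked example (illustrative numbers): DMA_DESCRIPTOR_BUFFER_MAX_SIZE is 4095, so with a
// 64-byte trans_align, ALIGN_DOWN(4095, 64) = 4032 keeps every full descriptor's size aligned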
// initialize implementation layer
async_memcpy_impl_init(&mcp_hdl->mcp_impl);
ret = async_memcpy_impl_init(&mcp_hdl->mcp_impl);
ESP_GOTO_ON_ERROR(ret, err, TAG, "DMA M2M init failed");
*asmcp = mcp_hdl;
@@ -121,14 +131,14 @@ static int async_memcpy_prepare_receive(async_memcpy_t asmcp, void *buffer, size
dma_descriptor_t *start = desc;
dma_descriptor_t *end = desc;
while (size > DMA_DESCRIPTOR_BUFFER_MAX_SIZE) {
while (size > asmcp->max_dma_buffer_size) {
if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
desc->dw0.suc_eof = 0;
desc->dw0.size = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
desc->dw0.size = asmcp->max_dma_buffer_size;
desc->buffer = &buf[prepared_length];
desc = desc->next; // move to next descriptor
prepared_length += DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
size -= DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
prepared_length += asmcp->max_dma_buffer_size;
size -= asmcp->max_dma_buffer_size;
} else {
// out of RX descriptors
goto _exit;
@@ -162,15 +172,15 @@ static int async_memcpy_prepare_transmit(async_memcpy_t asmcp, void *buffer, siz
dma_descriptor_t *start = desc;
dma_descriptor_t *end = desc;
while (len > DMA_DESCRIPTOR_BUFFER_MAX_SIZE) {
while (len > asmcp->max_dma_buffer_size) {
if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
desc->dw0.suc_eof = 0; // not the end of the transaction
desc->dw0.size = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
desc->dw0.length = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
desc->dw0.size = asmcp->max_dma_buffer_size;
desc->dw0.length = asmcp->max_dma_buffer_size;
desc->buffer = &buf[prepared_length];
desc = desc->next; // move to next descriptor
prepared_length += DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
len -= DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
prepared_length += asmcp->max_dma_buffer_size;
len -= asmcp->max_dma_buffer_size;
} else {
// out of TX descriptors
goto _exit;
@@ -222,14 +232,20 @@ esp_err_t esp_async_memcpy(async_memcpy_t asmcp, void *dst, void *src, size_t n,
size_t rx_prepared_size = 0;
size_t tx_prepared_size = 0;
ESP_GOTO_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, err, TAG, "mcp handle can't be null");
ESP_GOTO_ON_FALSE(async_memcpy_impl_is_buffer_address_valid(&asmcp->mcp_impl, src, dst), ESP_ERR_INVALID_ARG, err, TAG, "buffer address not valid");
ESP_GOTO_ON_FALSE(n <= DMA_DESCRIPTOR_BUFFER_MAX_SIZE * asmcp->max_stream_num, ESP_ERR_INVALID_ARG, err, TAG, "buffer size too large");
ESP_GOTO_ON_FALSE(async_memcpy_impl_is_buffer_address_valid(&asmcp->mcp_impl, src, dst), ESP_ERR_INVALID_ARG, err, TAG, "buffer address not valid: %p -> %p", src, dst);
ESP_GOTO_ON_FALSE(n <= asmcp->max_dma_buffer_size * asmcp->max_stream_num, ESP_ERR_INVALID_ARG, err, TAG, "buffer size too large");
if (asmcp->mcp_impl.sram_trans_align) {
ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.sram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %zu bytes", asmcp->mcp_impl.sram_trans_align);
}
if (asmcp->mcp_impl.psram_trans_align) {
ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.psram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %zu bytes", asmcp->mcp_impl.psram_trans_align);
}
// Prepare TX and RX descriptor
portENTER_CRITICAL_SAFE(&asmcp->spinlock);
rx_prepared_size = async_memcpy_prepare_receive(asmcp, dst, n, &rx_start_desc, &rx_end_desc);
tx_prepared_size = async_memcpy_prepare_transmit(asmcp, src, n, &tx_start_desc, &tx_end_desc);
if ((rx_prepared_size == n) && (tx_prepared_size == n)) {
if (rx_start_desc && tx_start_desc && (rx_prepared_size == n) && (tx_prepared_size == n)) {
// register user callback to the last descriptor
async_memcpy_stream_t *mcp_stream = __containerof(rx_end_desc, async_memcpy_stream_t, desc);
mcp_stream->cb = cb_isr;


@@ -54,8 +54,10 @@ typedef bool (*async_memcpy_isr_cb_t)(async_memcpy_t mcp_hdl, async_memcpy_event
*
*/
typedef struct {
uint32_t backlog; /*!< Maximum number of streams that can be handled simultaneously */
uint32_t flags; /*!< Extra flags to control async memcpy feature */
uint32_t backlog; /*!< Maximum number of streams that can be handled simultaneously */
size_t sram_trans_align; /*!< DMA transfer alignment (both in size and address) for SRAM memory */
size_t psram_trans_align; /*!< DMA transfer alignment (both in size and address) for PSRAM memory */
uint32_t flags; /*!< Extra flags to control async memcpy feature */
} async_memcpy_config_t;
/**
@@ -63,9 +65,11 @@ typedef struct {
*
*/
#define ASYNC_MEMCPY_DEFAULT_CONFIG() \
{ \
.backlog = 8, \
.flags = 0, \
{ \
.backlog = 8, \
.sram_trans_align = 0, \
.psram_trans_align = 0, \
.flags = 0, \
}
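/*
 * Sketch (illustrative) of overriding these defaults before installing the driver:
 *
 *     async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
 *     config.sram_trans_align = 4; // 4-byte alignment lets the driver enable burst mode
 *     async_memcpy_t mcp = NULL;
 *     ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &mcp));
 */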
/**


@@ -61,9 +61,21 @@ esp_err_t async_memcpy_impl_init(async_memcpy_impl_t *impl)
gdma_strategy_config_t strategy_config = {
.auto_update_desc = true,
.owner_check = true
.owner_check = true,
};
gdma_transfer_ability_t transfer_ability = {
.sram_trans_align = impl->sram_trans_align,
.psram_trans_align = impl->psram_trans_align,
};
ret = gdma_set_transfer_ability(impl->tx_channel, &transfer_ability);
if (ret != ESP_OK) {
goto err;
}
ret = gdma_set_transfer_ability(impl->rx_channel, &transfer_ability);
if (ret != ESP_OK) {
goto err;
}
gdma_apply_strategy(impl->tx_channel, &strategy_config);
gdma_apply_strategy(impl->rx_channel, &strategy_config);
@@ -108,5 +120,15 @@ esp_err_t async_memcpy_impl_restart(async_memcpy_impl_t *impl)
bool async_memcpy_impl_is_buffer_address_valid(async_memcpy_impl_t *impl, void *src, void *dst)
{
return true;
bool valid = true;
if (esp_ptr_external_ram(dst)) {
if (impl->psram_trans_align) {
valid = valid && (((intptr_t)dst & (impl->psram_trans_align - 1)) == 0);
}
} else {
if (impl->sram_trans_align) {
valid = valid && (((intptr_t)dst & (impl->sram_trans_align - 1)) == 0);
}
}
return valid;
}


@@ -46,6 +46,8 @@ typedef struct {
gdma_channel_handle_t rx_channel;
#endif
intptr_t rx_eof_addr;
size_t sram_trans_align;
size_t psram_trans_align;
bool isr_need_yield; // if current isr needs a yield for higher priority task
} async_memcpy_impl_t;


@@ -12,37 +12,75 @@
#include "ccomp_timer.h"
#include "esp_async_memcpy.h"
#include "soc/soc_caps.h"
#include "hal/dma_types.h"
#if SOC_CP_DMA_SUPPORTED || SOC_GDMA_SUPPORTED
#define ALIGN_UP(addr, align) (((addr) + (align)-1) & ~((align)-1))
#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1))
static void async_memcpy_setup_testbench(uint32_t seed, uint32_t *buffer_size, uint8_t **src_buf, uint8_t **dst_buf, uint8_t **from_addr, uint8_t **to_addr, uint32_t align)
typedef struct {
uint32_t seed;
uint32_t buffer_size;
uint8_t *src_buf;
uint8_t *dst_buf;
uint8_t *from_addr;
uint8_t *to_addr;
uint32_t align;
uint32_t offset;
bool src_in_psram;
bool dst_in_psram;
} memcpy_testbench_context_t;
static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_context)
{
srand(seed);
srand(test_context->seed);
printf("allocating memory buffer...\r\n");
// memory copy from/to PSRAM is not allowed
*src_buf = heap_caps_malloc(*buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
*dst_buf = heap_caps_calloc(1, *buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
TEST_ASSERT_NOT_NULL_MESSAGE(*src_buf, "allocate source buffer failed");
TEST_ASSERT_NOT_NULL_MESSAGE(*dst_buf, "allocate destination buffer failed");
*from_addr = (uint8_t *)ALIGN_UP((uint32_t)(*src_buf), 4);
*to_addr = (uint8_t *)ALIGN_UP((uint32_t)(*dst_buf), 4);
uint8_t gap = MAX(*from_addr - *src_buf, *to_addr - *dst_buf);
*buffer_size -= gap;
*from_addr += align;
*to_addr += align;
*buffer_size -= align;
printf("...size %d Bytes, src@%p, dst@%p\r\n", *buffer_size, *from_addr, *to_addr);
printf("fill src buffer with random data\r\n");
for (int i = 0; i < *buffer_size; i++) {
(*from_addr)[i] = rand() % 256;
uint32_t buffer_size = test_context->buffer_size;
uint8_t *src_buf = NULL;
uint8_t *dst_buf = NULL;
uint8_t *from_addr = NULL;
uint8_t *to_addr = NULL;
#if CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM
if (test_context->src_in_psram) {
src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_SPIRAM);
} else {
src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
}
if (test_context->dst_in_psram) {
dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_SPIRAM);
} else {
dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
}
#else
src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
#endif
TEST_ASSERT_NOT_NULL_MESSAGE(src_buf, "allocate source buffer failed");
TEST_ASSERT_NOT_NULL_MESSAGE(dst_buf, "allocate destination buffer failed");
// address alignment
from_addr = (uint8_t *)ALIGN_UP((uint32_t)(src_buf), test_context->align);
to_addr = (uint8_t *)ALIGN_UP((uint32_t)(dst_buf), test_context->align);
uint8_t gap = MAX(from_addr - src_buf, to_addr - dst_buf);
buffer_size -= gap;
// size alignment
buffer_size = ALIGN_DOWN(buffer_size, test_context->align);
// adding extra offset
from_addr += test_context->offset;
to_addr += test_context->offset;
buffer_size -= test_context->offset;
printf("...size %d Bytes, src@%p, dst@%p\r\n", buffer_size, from_addr, to_addr);
printf("fill src buffer with random data\r\n");
for (int i = 0; i < buffer_size; i++) {
from_addr[i] = rand() % 256;
}
// pass results back through the test context
test_context->buffer_size = buffer_size;
test_context->src_buf = src_buf;
test_context->dst_buf = dst_buf;
test_context->from_addr = from_addr;
test_context->to_addr = to_addr;
}
static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t buffer_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
@@ -91,18 +129,18 @@ TEST_CASE("memory copy by DMA one by one", "[async mcp]")
TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
uint32_t test_buffer_len[] = {256, 512, 1024, 2048, 4096, 5011};
uint8_t *sbuf = NULL;
uint8_t *dbuf = NULL;
uint8_t *from = NULL;
uint8_t *to = NULL;
memcpy_testbench_context_t test_context = {
.align = 4,
};
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
// Test different alignment offsets
for (int align = 0; align < 4; align++) {
async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbuf, &dbuf, &from, &to, align);
TEST_ESP_OK(esp_async_memcpy(driver, to, from, test_buffer_len[i], NULL, NULL));
async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbuf, dbuf, from, to);
for (int off = 0; off < 4; off++) {
test_context.buffer_size = test_buffer_len[i];
test_context.seed = i;
async_memcpy_setup_testbench(&test_context);
TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, NULL, NULL));
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
vTaskDelay(pdMS_TO_TICKS(100));
}
}
@@ -117,86 +155,177 @@ TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
uint32_t test_buffer_len[] = {512, 1024, 2048, 4096, 5011};
uint8_t *sbufs[] = {0, 0, 0, 0, 0};
uint8_t *dbufs[] = {0, 0, 0, 0, 0};
uint8_t *froms[] = {0, 0, 0, 0, 0};
uint8_t *tos[] = {0, 0, 0, 0, 0};
memcpy_testbench_context_t test_context[] = {
{.align = 4}, {.align = 4}, {.align = 4}, {.align = 4}, {.align = 4},
};
// Aligned case
for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbufs[i], &dbufs[i], &froms[i], &tos[i], 0);
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
test_context[i].seed = i;
test_context[i].buffer_size = test_buffer_len[i];
async_memcpy_setup_testbench(&test_context[i]);
}
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
TEST_ESP_OK(esp_async_memcpy(driver, tos[i], froms[i], test_buffer_len[i], NULL, NULL));
TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
}
for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbufs[i], dbufs[i], froms[i], tos[i]);
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
}
// Non-aligned case
for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbufs[i], &dbufs[i], &froms[i], &tos[i], 3);
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
test_context[i].seed = i;
test_context[i].buffer_size = test_buffer_len[i];
test_context[i].offset = 3;
async_memcpy_setup_testbench(&test_context[i]);
}
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
TEST_ESP_OK(esp_async_memcpy(driver, tos[i], froms[i], test_buffer_len[i], NULL, NULL));
TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
}
for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbufs[i], dbufs[i], froms[i], tos[i]);
for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
}
TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
}
#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (16)
static uint32_t test_async_memcpy_bench_len = 4095;
static int count = 0;
#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (16)
static int s_count = 0;
static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
{
SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
BaseType_t high_task_wakeup = pdFALSE;
count++;
if (count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
s_count++;
if (s_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
xSemaphoreGiveFromISR(sem, &high_task_wakeup);
}
return high_task_wakeup == pdTRUE;
}
TEST_CASE("memory copy by DMA with callback", "[async mcp]")
static void memcpy_performance_test(uint32_t buffer_size)
{
SemaphoreHandle_t sem = xSemaphoreCreateBinary();
async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
config.backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS;
config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS;
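// the backlog must cover every in-flight copy: each copy of buffer_size bytes occupies
// roughly buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1 descriptors, and
// TEST_ASYNC_MEMCPY_BENCH_COUNTS copies are queued at once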
config.sram_trans_align = 4; // at least 4 bytes aligned for SRAM transfer
config.psram_trans_align = 64; // at least 64 bytes aligned for PSRAM transfer
async_memcpy_t driver = NULL;
int64_t elapse_us = 0;
float throughput = 0.0;
TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
uint8_t *sbuf = NULL;
uint8_t *dbuf = NULL;
uint8_t *from = NULL;
uint8_t *to = NULL;
async_memcpy_setup_testbench(0, &test_async_memcpy_bench_len, &sbuf, &dbuf, &from, &to, 0);
count = 0;
// 1. SRAM->SRAM
memcpy_testbench_context_t test_context = {
.align = config.psram_trans_align,
.buffer_size = buffer_size,
.src_in_psram = false,
.dst_in_psram = false,
};
async_memcpy_setup_testbench(&test_context);
s_count = 0;
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
TEST_ESP_OK(esp_async_memcpy(driver, to, from, test_async_memcpy_bench_len, test_async_memcpy_isr_cb, sem));
TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
}
// wait for done semaphore
TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
esp_rom_printf("memcpy %d Bytes data by HW costs %lldus\r\n", test_async_memcpy_bench_len, ccomp_timer_stop() / TEST_ASYNC_MEMCPY_BENCH_COUNTS);
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
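// MB/s = bytes per round * rounds / 2^20, divided by the elapsed seconds (elapse_us / 1e6);
// with hypothetical numbers, 16 copies of 40 KB in 2000 us give ~312.5 MB/s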
IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
memcpy(to, from, test_async_memcpy_bench_len);
memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
}
esp_rom_printf("memcpy %d Bytes data by SW costs %lldus\r\n", test_async_memcpy_bench_len, ccomp_timer_stop() / TEST_ASYNC_MEMCPY_BENCH_COUNTS);
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
async_memcpy_verify_and_clear_testbench(0, test_async_memcpy_bench_len, sbuf, dbuf, from, to);
#if CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM
// 2. PSRAM->PSRAM
test_context.src_in_psram = true;
test_context.dst_in_psram = true;
async_memcpy_setup_testbench(&test_context);
s_count = 0;
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
}
// wait for done semaphore
TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
}
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
// 3. PSRAM->SRAM
test_context.src_in_psram = true;
test_context.dst_in_psram = false;
async_memcpy_setup_testbench(&test_context);
s_count = 0;
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
}
// wait for done semaphore
TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
}
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
// 4. SRAM->PSRAM
test_context.src_in_psram = false;
test_context.dst_in_psram = true;
async_memcpy_setup_testbench(&test_context);
s_count = 0;
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
}
// wait for done semaphore
TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
ccomp_timer_start();
for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
}
elapse_us = ccomp_timer_stop();
throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
#endif
TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
vSemaphoreDelete(sem);
}
TEST_CASE("memory copy performance test 40KB", "[async mcp]")
{
memcpy_performance_test(40 * 1024);
}
TEST_CASE("memory copy performance test 4KB", "[async mcp]")
{
memcpy_performance_test(4 * 1024);
}
#endif //SOC_CP_DMA_SUPPORTED || SOC_GDMA_SUPPORTED


@@ -37,7 +37,7 @@ static inline esp_err_t crypto_shared_gdma_new_channel(gdma_channel_alloc_config
esp_err_t ret;
int time_waited_ms = 0;
while(1) {
while (1) {
ret = gdma_new_channel(channel_config, channel);
if (ret == ESP_OK) {
@@ -58,14 +58,12 @@ static inline esp_err_t crypto_shared_gdma_new_channel(gdma_channel_alloc_config
/* Initialize external memory specific DMA configs */
static void esp_crypto_shared_dma_init_extmem(void)
{
int tx_ch_id = 0;
int rx_ch_id = 0;
gdma_get_channel_id(tx_channel, &tx_ch_id);
gdma_get_channel_id(rx_channel, &rx_ch_id);
gdma_ll_tx_set_block_size_psram(&GDMA, tx_ch_id, GDMA_LL_EXT_MEM_BK_SIZE_16B);
gdma_ll_rx_set_block_size_psram(&GDMA, rx_ch_id, GDMA_LL_EXT_MEM_BK_SIZE_16B);
gdma_transfer_ability_t transfer_ability = {
.sram_trans_align = 4,
.psram_trans_align = 16,
};
gdma_set_transfer_ability(tx_channel, &transfer_ability);
gdma_set_transfer_ability(rx_channel, &transfer_ability);
}
#endif //SOC_GDMA_SUPPORT_PSRAM
@@ -137,7 +135,7 @@ esp_err_t esp_crypto_shared_gdma_start(const lldesc_t *input, const lldesc_t *ou
return ESP_ERR_INVALID_ARG;
}
/* tx channel is reset by gdma_connect(), also reset rx to ensure a known state */
gdma_get_channel_id(rx_channel, &rx_ch_id);
gdma_ll_rx_reset_channel(&GDMA, rx_ch_id);


@@ -22,6 +22,8 @@ Configure and Install driver
Driver configuration is described in :cpp:type:`async_memcpy_config_t`:
:cpp:member:`backlog`: This is used to configure the maximum number of DMA operations that can be working in the background at the same time.
:cpp:member:`sram_trans_align`: Declare SRAM alignment for both data address and copy size; set to zero if the data has no alignment restriction. If set to a multiple of four (i.e. 4x), the driver will enable burst mode internally, which is helpful for performance-sensitive applications.
:cpp:member:`psram_trans_align`: Declare PSRAM alignment for both data address and copy size. The user has to set a valid value (only 16, 32 and 64 are supported) if the memcpy destination is located in PSRAM. If it is set to zero, the default alignment (i.e. 16) is applied. Internally, the driver configures the block size used by the DMA to access PSRAM according to this alignment.
:cpp:member:`flags`: This is used to enable some special driver features.
:c:macro:`ASYNC_MEMCPY_DEFAULT_CONFIG` provides a default configuration, which sets the backlog to 8.
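For example, installing the driver for copies whose destination lives in PSRAM could look like this (a sketch; the buffer names ``to_psram``/``from_sram`` and the alignment value are illustrative)::

    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    config.psram_trans_align = 64; // destination buffers in PSRAM are 64-byte aligned
    async_memcpy_t driver = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &driver));
    // both the buffer addresses and the copy size must honor the declared alignment
    ESP_ERROR_CHECK(esp_async_memcpy(driver, to_psram, from_sram, 4096, NULL, NULL));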


@@ -73,7 +73,7 @@ extern "C" {
/* @brief macro to print IDF performance
* @param mode : performance item name. a string pointer.
* @param item : performance item name. a string pointer.
* @param value_fmt: print format and unit of the value, for example: "%02fms", "%dKB"
* @param value : the performance value.
*/